Ray Train Axolotl Integration (#2251)

* current not clean working version move torch trainer to do_cli update code with config changes and clean up edit config cleanup add run name to trainer * address comments * use axolotl train in multigpu tests and add ray tests for multi-gpu * accelerate uses underscores for main_process_port arg * chore: lint * fix order of accelerate args * include ray train in docker images * current not clean working version move torch trainer to do_cli update code with config changes and clean up edit config cleanup add run name to trainer * address comments * use axolotl train in multigpu tests and add ray tests for multi-gpu * accelerate uses underscores for main_process_port arg * chore: lint * fix order of accelerate args * include ray train in docker images * fix bf16 resolution behavior * move dtype logic * x Signed-off-by: SumanthRH <sumanthrh@anyscale.com> * rename Signed-off-by: SumanthRH <sumanthrh@anyscale.com> * add to sidebar Signed-off-by: SumanthRH <sumanthrh@anyscale.com> * Apply suggestions from code review Co-authored-by: Eric Tang <46737979+erictang000@users.noreply.github.com> * Update docs/ray-integration.qmd Co-authored-by: Eric Tang <46737979+erictang000@users.noreply.github.com> * pre-commit fixes Signed-off-by: SumanthRH <sumanthrh@anyscale.com> * use output_dir instead of hardcoded saves path Co-authored-by: NanoCode012 <kevinvong@rocketmail.com> * bugfix storage dir * change type for resources_per_worker --------- Signed-off-by: SumanthRH <sumanthrh@anyscale.com> Co-authored-by: Wing Lian <wing@axolotl.ai> Co-authored-by: SumanthRH <sumanthrh@anyscale.com> Co-authored-by: Sumanth R Hegde <39546518+SumanthRH@users.noreply.github.com> Co-authored-by: Wing Lian <wing.lian@gmail.com> Co-authored-by: NanoCode012 <kevinvong@rocketmail.com>
2025-01-28 21:10:19 -08:00
parent 54dd7abfc1
commit 268543a3be
16 changed files with 492 additions and 100 deletions
--- a/tests/e2e/multigpu/test_llama.py
+++ b/tests/e2e/multigpu/test_llama.py
@@ -74,15 +74,13 @@ class TestMultiGPULlama:

        execute_subprocess_async(
            [
-                "accelerate",
-                "launch",
+                "axolotl",
+                "train",
+                str(Path(temp_dir) / "config.yaml"),
                "--num-processes",
                "2",
-                "--main_process_port",
+                "--main-process-port",
                f"{get_torch_dist_unique_port()}",
-                "-m",
-                "axolotl.cli.train",
-                str(Path(temp_dir) / "config.yaml"),
            ]
        )

@@ -139,15 +137,13 @@ class TestMultiGPULlama:

        execute_subprocess_async(
            [
-                "accelerate",
-                "launch",
+                "axolotl",
+                "train",
+                str(Path(temp_dir) / "config.yaml"),
                "--num-processes",
                "2",
-                "--main_process_port",
+                "--main-process-port",
                f"{get_torch_dist_unique_port()}",
-                "-m",
-                "axolotl.cli.train",
-                str(Path(temp_dir) / "config.yaml"),
            ]
        )

@@ -214,15 +210,13 @@ class TestMultiGPULlama:

        execute_subprocess_async(
            [
-                "accelerate",
-                "launch",
+                "axolotl",
+                "train",
+                str(Path(temp_dir) / "config.yaml"),
                "--num-processes",
                "2",
-                "--main_process_port",
+                "--main-process-port",
                f"{get_torch_dist_unique_port()}",
-                "-m",
-                "axolotl.cli.train",
-                str(Path(temp_dir) / "config.yaml"),
            ]
        )

@@ -293,15 +287,13 @@ class TestMultiGPULlama:

        execute_subprocess_async(
            [
-                "accelerate",
-                "launch",
+                "axolotl",
+                "train",
+                str(Path(temp_dir) / "config.yaml"),
                "--num-processes",
                "2",
-                "--main_process_port",
+                "--main-process-port",
                f"{get_torch_dist_unique_port()}",
-                "-m",
-                "axolotl.cli.train",
-                str(Path(temp_dir) / "config.yaml"),
            ]
        )

@@ -367,15 +359,13 @@ class TestMultiGPULlama:

        execute_subprocess_async(
            [
-                "accelerate",
-                "launch",
+                "axolotl",
+                "train",
+                str(Path(temp_dir) / "config.yaml"),
                "--num-processes",
                "2",
-                "--main_process_port",
+                "--main-process-port",
                f"{get_torch_dist_unique_port()}",
-                "-m",
-                "axolotl.cli.train",
-                str(Path(temp_dir) / "config.yaml"),
            ]
        )

@@ -439,15 +429,13 @@ class TestMultiGPULlama:

        execute_subprocess_async(
            [
-                "accelerate",
-                "launch",
+                "axolotl",
+                "train",
+                str(Path(temp_dir) / "config.yaml"),
                "--num-processes",
                "2",
-                "--main_process_port",
+                "--main-process-port",
                f"{get_torch_dist_unique_port()}",
-                "-m",
-                "axolotl.cli.train",
-                str(Path(temp_dir) / "config.yaml"),
            ]
        )

@@ -520,15 +508,13 @@ class TestMultiGPULlama:

        execute_subprocess_async(
            [
-                "accelerate",
-                "launch",
+                "axolotl",
+                "train",
+                str(Path(temp_dir) / "config.yaml"),
                "--num-processes",
                "2",
-                "--main_process_port",
+                "--main-process-port",
                f"{get_torch_dist_unique_port()}",
-                "-m",
-                "axolotl.cli.train",
-                str(Path(temp_dir) / "config.yaml"),
            ]
        )

@@ -605,15 +591,13 @@ class TestMultiGPULlama:

        execute_subprocess_async(
            [
-                "accelerate",
-                "launch",
+                "axolotl",
+                "train",
+                str(Path(temp_dir) / "config.yaml"),
                "--num-processes",
                "2",
-                "--main_process_port",
+                "--main-process-port",
                f"{get_torch_dist_unique_port()}",
-                "-m",
-                "axolotl.cli.train",
-                str(Path(temp_dir) / "config.yaml"),
            ]
        )

@@ -680,15 +664,13 @@ class TestMultiGPULlama:

        execute_subprocess_async(
            [
-                "accelerate",
-                "launch",
+                "axolotl",
+                "train",
+                str(Path(temp_dir) / "config.yaml"),
                "--num-processes",
                "2",
-                "--main_process_port",
+                "--main-process-port",
                f"{get_torch_dist_unique_port()}",
-                "-m",
-                "axolotl.cli.train",
-                str(Path(temp_dir) / "config.yaml"),
            ]
        )

@@ -755,15 +737,13 @@ class TestMultiGPULlama:

        execute_subprocess_async(
            [
-                "accelerate",
-                "launch",
+                "axolotl",
+                "train",
+                str(Path(temp_dir) / "config.yaml"),
                "--num-processes",
                "2",
-                "--main_process_port",
+                "--main-process-port",
                f"{get_torch_dist_unique_port()}",
-                "-m",
-                "axolotl.cli.train",
-                str(Path(temp_dir) / "config.yaml"),
            ]
        )