Ray Train Axolotl Integration (#2251)

* current

not clean working version
move torch trainer to do_cli
update code with config changes and clean up
edit config
cleanup
add run name to trainer

* address comments

* use axolotl train in multigpu tests and add ray tests for multi-gpu

* accelerate uses underscores for main_process_port arg

* chore: lint

* fix order of accelerate args

* include ray train in docker images

* current

not clean working version
move torch trainer to do_cli
update code with config changes and clean up
edit config
cleanup
add run name to trainer

* address comments

* use axolotl train in multigpu tests and add ray tests for multi-gpu

* accelerate uses underscores for main_process_port arg

* chore: lint

* fix order of accelerate args

* include ray train in docker images

* fix bf16 resolution behavior

* move dtype logic

* x

Signed-off-by: SumanthRH <sumanthrh@anyscale.com>

* rename

Signed-off-by: SumanthRH <sumanthrh@anyscale.com>

* add to sidebar

Signed-off-by: SumanthRH <sumanthrh@anyscale.com>

* Apply suggestions from code review

Co-authored-by: Eric Tang <46737979+erictang000@users.noreply.github.com>

* Update docs/ray-integration.qmd

Co-authored-by: Eric Tang <46737979+erictang000@users.noreply.github.com>

* pre-commit fixes

Signed-off-by: SumanthRH <sumanthrh@anyscale.com>

* use output_dir instead of hardcoded saves path

Co-authored-by: NanoCode012 <kevinvong@rocketmail.com>

* bugfix storage dir

* change type\ for resources_per_worker

---------

Signed-off-by: SumanthRH <sumanthrh@anyscale.com>
Co-authored-by: Wing Lian <wing@axolotl.ai>
Co-authored-by: SumanthRH <sumanthrh@anyscale.com>
Co-authored-by: Sumanth R Hegde <39546518+SumanthRH@users.noreply.github.com>
Co-authored-by: Wing Lian <wing.lian@gmail.com>
Co-authored-by: NanoCode012 <kevinvong@rocketmail.com>
This commit is contained in:
Eric Tang
2025-01-28 21:10:19 -08:00
committed by GitHub
parent 54dd7abfc1
commit 268543a3be
16 changed files with 492 additions and 100 deletions

View File

@@ -74,15 +74,13 @@ class TestMultiGPULlama:
execute_subprocess_async(
[
"accelerate",
"launch",
"axolotl",
"train",
str(Path(temp_dir) / "config.yaml"),
"--num-processes",
"2",
"--main_process_port",
"--main-process-port",
f"{get_torch_dist_unique_port()}",
"-m",
"axolotl.cli.train",
str(Path(temp_dir) / "config.yaml"),
]
)
@@ -139,15 +137,13 @@ class TestMultiGPULlama:
execute_subprocess_async(
[
"accelerate",
"launch",
"axolotl",
"train",
str(Path(temp_dir) / "config.yaml"),
"--num-processes",
"2",
"--main_process_port",
"--main-process-port",
f"{get_torch_dist_unique_port()}",
"-m",
"axolotl.cli.train",
str(Path(temp_dir) / "config.yaml"),
]
)
@@ -214,15 +210,13 @@ class TestMultiGPULlama:
execute_subprocess_async(
[
"accelerate",
"launch",
"axolotl",
"train",
str(Path(temp_dir) / "config.yaml"),
"--num-processes",
"2",
"--main_process_port",
"--main-process-port",
f"{get_torch_dist_unique_port()}",
"-m",
"axolotl.cli.train",
str(Path(temp_dir) / "config.yaml"),
]
)
@@ -293,15 +287,13 @@ class TestMultiGPULlama:
execute_subprocess_async(
[
"accelerate",
"launch",
"axolotl",
"train",
str(Path(temp_dir) / "config.yaml"),
"--num-processes",
"2",
"--main_process_port",
"--main-process-port",
f"{get_torch_dist_unique_port()}",
"-m",
"axolotl.cli.train",
str(Path(temp_dir) / "config.yaml"),
]
)
@@ -367,15 +359,13 @@ class TestMultiGPULlama:
execute_subprocess_async(
[
"accelerate",
"launch",
"axolotl",
"train",
str(Path(temp_dir) / "config.yaml"),
"--num-processes",
"2",
"--main_process_port",
"--main-process-port",
f"{get_torch_dist_unique_port()}",
"-m",
"axolotl.cli.train",
str(Path(temp_dir) / "config.yaml"),
]
)
@@ -439,15 +429,13 @@ class TestMultiGPULlama:
execute_subprocess_async(
[
"accelerate",
"launch",
"axolotl",
"train",
str(Path(temp_dir) / "config.yaml"),
"--num-processes",
"2",
"--main_process_port",
"--main-process-port",
f"{get_torch_dist_unique_port()}",
"-m",
"axolotl.cli.train",
str(Path(temp_dir) / "config.yaml"),
]
)
@@ -520,15 +508,13 @@ class TestMultiGPULlama:
execute_subprocess_async(
[
"accelerate",
"launch",
"axolotl",
"train",
str(Path(temp_dir) / "config.yaml"),
"--num-processes",
"2",
"--main_process_port",
"--main-process-port",
f"{get_torch_dist_unique_port()}",
"-m",
"axolotl.cli.train",
str(Path(temp_dir) / "config.yaml"),
]
)
@@ -605,15 +591,13 @@ class TestMultiGPULlama:
execute_subprocess_async(
[
"accelerate",
"launch",
"axolotl",
"train",
str(Path(temp_dir) / "config.yaml"),
"--num-processes",
"2",
"--main_process_port",
"--main-process-port",
f"{get_torch_dist_unique_port()}",
"-m",
"axolotl.cli.train",
str(Path(temp_dir) / "config.yaml"),
]
)
@@ -680,15 +664,13 @@ class TestMultiGPULlama:
execute_subprocess_async(
[
"accelerate",
"launch",
"axolotl",
"train",
str(Path(temp_dir) / "config.yaml"),
"--num-processes",
"2",
"--main_process_port",
"--main-process-port",
f"{get_torch_dist_unique_port()}",
"-m",
"axolotl.cli.train",
str(Path(temp_dir) / "config.yaml"),
]
)
@@ -755,15 +737,13 @@ class TestMultiGPULlama:
execute_subprocess_async(
[
"accelerate",
"launch",
"axolotl",
"train",
str(Path(temp_dir) / "config.yaml"),
"--num-processes",
"2",
"--main_process_port",
"--main-process-port",
f"{get_torch_dist_unique_port()}",
"-m",
"axolotl.cli.train",
str(Path(temp_dir) / "config.yaml"),
]
)