chore: logging cleanup (#3482) [skip ci]
This commit is contained in:
@@ -196,12 +196,10 @@ def do_cli(config: Union[Path, str] = Path("examples/"), **kwargs):
|
|||||||
state.wait_for_everyone()
|
state.wait_for_everyone()
|
||||||
LOG.info(
|
LOG.info(
|
||||||
f"FSDP SHARDED_STATE_DICT weights successfully merged to: {output_path}",
|
f"FSDP SHARDED_STATE_DICT weights successfully merged to: {output_path}",
|
||||||
main_process_only=True,
|
|
||||||
)
|
)
|
||||||
LOG.info(
|
LOG.info(
|
||||||
"Merged weights are only the safetensors and doesn't include the model configuration "
|
"Merged weights are only the safetensors and doesn't include the model configuration "
|
||||||
f"or tokenizer which may be found in {parsed_cfg.output_dir}.",
|
f"or tokenizer which may be found in {parsed_cfg.output_dir}.",
|
||||||
main_process_only=True,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -19,5 +19,4 @@ class CheckpointSaveMixin(Trainer):
|
|||||||
f"Trainer does not support saving optimizer and scheduler: {exc}\n"
|
f"Trainer does not support saving optimizer and scheduler: {exc}\n"
|
||||||
"Optimizer and scheduler states were not saved - resuming from checkpoints "
|
"Optimizer and scheduler states were not saved - resuming from checkpoints "
|
||||||
"for this training run will not be possible.",
|
"for this training run will not be possible.",
|
||||||
main_process_only=True,
|
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -64,15 +64,12 @@ def patch_flex_wrapper(**flex_attn_compile_kwargs):
|
|||||||
LOG.info(
|
LOG.info(
|
||||||
"Compiling flex attention with kwargs: %s. This may take a while...",
|
"Compiling flex attention with kwargs: %s. This may take a while...",
|
||||||
flex_attn_compile_kwargs,
|
flex_attn_compile_kwargs,
|
||||||
main_process_only=True,
|
|
||||||
)
|
)
|
||||||
self._compiled_flex_attention = torch.compile(
|
self._compiled_flex_attention = torch.compile(
|
||||||
flex_attention,
|
flex_attention,
|
||||||
**flex_attn_compile_kwargs,
|
**flex_attn_compile_kwargs,
|
||||||
)
|
)
|
||||||
LOG.info(
|
LOG.info("Flex attention compiled successfully.")
|
||||||
"Flex attention compiled successfully.", main_process_only=True
|
|
||||||
)
|
|
||||||
|
|
||||||
self._is_flex_compiled = True
|
self._is_flex_compiled = True
|
||||||
|
|
||||||
|
|||||||
@@ -154,7 +154,6 @@ def register_ring_attn_from_device_mesh(
|
|||||||
LOG.info(
|
LOG.info(
|
||||||
f"Enabling ring attention sequence parallelism using DeviceMesh "
|
f"Enabling ring attention sequence parallelism using DeviceMesh "
|
||||||
f"dimension '{context_parallel_dim}'",
|
f"dimension '{context_parallel_dim}'",
|
||||||
main_process_only=True,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# Extract the sequence parallel submesh
|
# Extract the sequence parallel submesh
|
||||||
|
|||||||
@@ -85,7 +85,6 @@ def patch_tiled_mlp(model_type, use_original_mlp=True, cfg_num_shards=None):
|
|||||||
mlp_cls._tiled_mlp_dist_impl = None
|
mlp_cls._tiled_mlp_dist_impl = None
|
||||||
LOG.info(
|
LOG.info(
|
||||||
f"Successfully monkey-patched TiledMLP for model_type: {model_type}",
|
f"Successfully monkey-patched TiledMLP for model_type: {model_type}",
|
||||||
main_process_only=True,
|
|
||||||
)
|
)
|
||||||
except (ImportError, AttributeError) as e:
|
except (ImportError, AttributeError) as e:
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
|
|||||||
@@ -69,7 +69,6 @@ def setup_model_and_tokenizer(
|
|||||||
# Load tokenizer
|
# Load tokenizer
|
||||||
LOG.debug(
|
LOG.debug(
|
||||||
f"loading tokenizer... {cfg.tokenizer_config or cfg.base_model_config}",
|
f"loading tokenizer... {cfg.tokenizer_config or cfg.base_model_config}",
|
||||||
main_process_only=True,
|
|
||||||
)
|
)
|
||||||
tokenizer = load_tokenizer(cfg)
|
tokenizer = load_tokenizer(cfg)
|
||||||
|
|
||||||
|
|||||||
@@ -59,7 +59,6 @@ class DynamicCheckpointCallback(TrainerCallback):
|
|||||||
f"Dynamic checkpoint enabled. To trigger checkpoint save:\n"
|
f"Dynamic checkpoint enabled. To trigger checkpoint save:\n"
|
||||||
f" • File: touch {cfg.output_dir}/{self.trigger_filename}\n"
|
f" • File: touch {cfg.output_dir}/{self.trigger_filename}\n"
|
||||||
f" • Check interval: every {self.check_interval} steps",
|
f" • Check interval: every {self.check_interval} steps",
|
||||||
main_process_only=True,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
def on_step_end(
|
def on_step_end(
|
||||||
@@ -89,12 +88,10 @@ class DynamicCheckpointCallback(TrainerCallback):
|
|||||||
LOG.info(
|
LOG.info(
|
||||||
f"Dynamic checkpoint triggered via file '{self.trigger_filename}' "
|
f"Dynamic checkpoint triggered via file '{self.trigger_filename}' "
|
||||||
f"at step {state.global_step}",
|
f"at step {state.global_step}",
|
||||||
main_process_only=True,
|
|
||||||
)
|
)
|
||||||
except OSError as exc:
|
except OSError as exc:
|
||||||
LOG.warning(
|
LOG.warning(
|
||||||
f"Failed to delete trigger file: {exc}",
|
f"Failed to delete trigger file: {exc}",
|
||||||
main_process_only=True,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
if self.should_save_checkpoint:
|
if self.should_save_checkpoint:
|
||||||
@@ -127,6 +124,5 @@ class DynamicCheckpointCallback(TrainerCallback):
|
|||||||
control.should_save = True
|
control.should_save = True
|
||||||
LOG.info(
|
LOG.info(
|
||||||
f"Saving dynamic checkpoint at step {state.global_step}",
|
f"Saving dynamic checkpoint at step {state.global_step}",
|
||||||
main_process_only=True,
|
|
||||||
)
|
)
|
||||||
return control
|
return control
|
||||||
|
|||||||
@@ -474,13 +474,11 @@ def load_preprocessed_dataset(cfg: DictDefault, dataset_hash: str) -> Dataset |
|
|||||||
):
|
):
|
||||||
LOG.info(
|
LOG.info(
|
||||||
f"Loading prepared dataset from disk at {prepared_ds_path}...",
|
f"Loading prepared dataset from disk at {prepared_ds_path}...",
|
||||||
main_process_only=True,
|
|
||||||
)
|
)
|
||||||
return load_from_disk(str(prepared_ds_path))
|
return load_from_disk(str(prepared_ds_path))
|
||||||
|
|
||||||
LOG.info(
|
LOG.info(
|
||||||
f"Unable to find prepared dataset in {prepared_ds_path}",
|
f"Unable to find prepared dataset in {prepared_ds_path}",
|
||||||
main_process_only=True,
|
|
||||||
)
|
)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|||||||
@@ -128,7 +128,6 @@ class DatasetValidationMixin:
|
|||||||
):
|
):
|
||||||
LOG.info(
|
LOG.info(
|
||||||
"explicitly setting `eval_sample_packing` to match `sample_packing`",
|
"explicitly setting `eval_sample_packing` to match `sample_packing`",
|
||||||
main_process_only=True,
|
|
||||||
)
|
)
|
||||||
data["eval_sample_packing"] = True
|
data["eval_sample_packing"] = True
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user