move unmaintained examples to archive (#2903) [skip ci]

2025-07-12 11:39:51 -04:00
parent d6e4a611e5
commit fb7bc9250d
50 changed files with 5 additions and 0 deletions
--- a/examples/archived/dbrx/16bit-lora.yaml
+++ b/examples/archived/dbrx/16bit-lora.yaml
@@ -0,0 +1,74 @@
+base_model: LnL-AI/dbrx-base-converted-v2
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
+
+trust_remote_code: true
+
+datasets:
+  - path: tatsu-lab/alpaca
+    type: alpaca
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.0
+output_dir: ./outputs/out
+
+sequence_len: 512
+sample_packing: false
+pad_to_sequence_len: false
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+adapter: lora
+lora_model_dir:
+lora_r: 8
+lora_alpha: 16
+lora_dropout: 0.05
+# targeting w1, w2, or v1 with LoRA hangs the trainer, so they stay commented out below
+lora_target_modules:
+  - q_proj # attn
+  - k_proj # attn
+  - v_proj # attn
+  - out_proj # attn
+  - layer # router
+#  - w1
+#  - w2
+#  - v1
+
+gradient_accumulation_steps: 1
+micro_batch_size: 1
+num_epochs: 1
+optimizer: paged_adamw_8bit
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+bf16: auto
+tf32: false
+
+gradient_checkpointing: false  # don't use with fsdp_activation_checkpointing
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+warmup_steps: 10
+evals_per_epoch:
+saves_per_epoch: 1
+
+weight_decay: 0.0
+fsdp:
+  - full_shard
+  - auto_wrap
+fsdp_config:
+  fsdp_limit_all_gathers: true
+  fsdp_sync_module_states: true
+  fsdp_offload_params: false
+  fsdp_use_orig_params: false
+  fsdp_cpu_ram_efficient_loading: true
+  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+  fsdp_transformer_layer_cls_to_wrap: DbrxBlock
+  fsdp_state_dict_type: FULL_STATE_DICT
+  fsdp_activation_checkpointing: true
--- a/examples/archived/dbrx/8bit-lora.yaml
+++ b/examples/archived/dbrx/8bit-lora.yaml
@@ -0,0 +1,77 @@
+base_model: LnL-AI/dbrx-base-converted-v2
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
+
+trust_remote_code: true
+
+load_in_8bit: true
+load_in_4bit: false
+
+datasets:
+  - path: tatsu-lab/alpaca
+    type: alpaca
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.0
+output_dir: ./outputs/out
+
+sequence_len: 512
+sample_packing: false
+pad_to_sequence_len: false
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+adapter: lora
+lora_model_dir:
+lora_r: 8
+lora_alpha: 16
+lora_dropout: 0.05
+# targeting w1, w2, or v1 with LoRA hangs the trainer, so they stay commented out below
+lora_target_modules:
+  - q_proj # attn
+  - k_proj # attn
+  - v_proj # attn
+  - out_proj # attn
+  - layer # router
+#  - w1
+#  - w2
+#  - v1
+
+gradient_accumulation_steps: 1
+micro_batch_size: 1
+num_epochs: 1
+optimizer: paged_adamw_8bit
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+bf16: auto
+tf32: false
+
+gradient_checkpointing: false  # don't use with fsdp_activation_checkpointing
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+warmup_steps: 10
+evals_per_epoch:
+saves_per_epoch: 1
+
+weight_decay: 0.0
+fsdp:
+  - full_shard
+  - auto_wrap
+fsdp_config:
+  fsdp_limit_all_gathers: true
+  fsdp_sync_module_states: true
+  fsdp_offload_params: false
+  fsdp_use_orig_params: false
+  fsdp_cpu_ram_efficient_loading: true
+  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+  fsdp_transformer_layer_cls_to_wrap: DbrxBlock
+  fsdp_state_dict_type: FULL_STATE_DICT
+  fsdp_activation_checkpointing: true
--- a/examples/archived/dbrx/README.md
+++ b/examples/archived/dbrx/README.md
@@ -0,0 +1,26 @@
+# DBRX MoE
+
+Currently, for LoRA, only the `q_proj`, `k_proj`, `v_proj`, `out_proj`, and `layer` Linear layers are trainable.
+
+We are using the "converted" base models based on [this issue](https://huggingface.co/databricks/dbrx-instruct/discussions/10)
+where the Experts are fused as an `nn.Parameter` rather than an `nn.Linear` layer. However, the implementation
+is still a bit buggy, and attempting to train a LoRA adapter over those `w1`, `w2` and `v1` layers
+results in the trainer hanging.
+
+
+### FSDP
+We've tested using the [`LnL-AI/dbrx-base-converted-v2`](https://huggingface.co/LnL-AI/dbrx-base-converted-v2) model as the base model for FSDP.
+
+The high memory usage seen w/ FSDP is due to FSDP not supporting 8-bit optimizers.
+
+- 16-bit LoRA w/ FSDP
+  - ✅ w/o CPU Offload - 8x80GB uses ~80GiB/gpu
+  - ❌ w/ CPU Offload - `paged_adamw_8bit` optimizer errors from being on cpu
+- ✅ 8-bit LoRA w/ FSDP
+- ❌ 4-bit QLoRA w/ FSDP - errors w/: `Error an illegal memory access was encountered at line 90 in file /src/csrc/ops.cu`
+- ✅ bf16 full finetune w/ FSDP, freezing all but first 8 layers (8x80GB uses ~78GiB/gpu)
+
+
+### Deepspeed
+
+WIP
--- a/examples/archived/dbrx/fft-ds-zero3.yaml
+++ b/examples/archived/dbrx/fft-ds-zero3.yaml
@@ -0,0 +1,49 @@
+base_model: LnL-AI/dbrx-base-converted-v2
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
+
+trust_remote_code: true
+
+datasets:
+  - path: tatsu-lab/alpaca
+    type: alpaca
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.0
+output_dir: ./outputs/out
+
+sequence_len: 512
+sample_packing: false
+pad_to_sequence_len: false
+
+unfrozen_parameters:
+  - transformer.blocks.[0-7].
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+gradient_accumulation_steps: 1
+micro_batch_size: 1
+num_epochs: 1
+optimizer: paged_adamw_8bit
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+bf16: auto
+tf32: false
+
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+warmup_steps: 10
+evals_per_epoch:
+saves_per_epoch: 1
+
+weight_decay: 0.0
+deepspeed: deepspeed_configs/zero3_bf16.json