# SwanLab LoRA Training Example with Performance Profiling
#
# This example demonstrates standard LoRA fine-tuning with SwanLab integration
# for performance profiling and optimization.
#
# Features enabled:
# - SwanLab experiment tracking
# - Performance profiling (training step, forward/backward pass timing)
# - Real-time metrics visualization
#
# To run:
#   export SWANLAB_API_KEY=your-api-key
#   accelerate launch -m axolotl.cli.train examples/swanlab/lora-swanlab-profiling.yml

# Model Configuration
base_model: NousResearch/Llama-3.2-1B

# Dataset Configuration
datasets:
  - path: teknium/GPT4-LLM-Cleaned
    type: alpaca
val_set_size: 0.1
output_dir: ./outputs/lora-swanlab-profiling-out

# LoRA Configuration
adapter: lora
lora_r: 16
lora_alpha: 32
lora_dropout: 0.05
lora_target_modules:
  - gate_proj
  - down_proj
  - up_proj
  - q_proj
  - v_proj
  - k_proj
  - o_proj

# Training Configuration
sequence_len: 2048
sample_packing: true
eval_sample_packing: true
micro_batch_size: 2
gradient_accumulation_steps: 2
num_epochs: 1

# Optimization
optimizer: adamw_8bit
lr_scheduler: cosine
learning_rate: 0.0002
warmup_ratio: 0.1
weight_decay: 0.0

# Precision
bf16: auto
tf32: false

# Performance
gradient_checkpointing: true
flash_attention: true

# Checkpointing and Logging
logging_steps: 1
evals_per_epoch: 4
saves_per_epoch: 1

# Loss Monitoring
loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

special_tokens:
  pad_token: "<|end_of_text|>"

# ============================================================================
# SwanLab Integration
# ============================================================================
plugins:
  - axolotl.integrations.swanlab.SwanLabPlugin

# Basic SwanLab Configuration
use_swanlab: true
swanlab_project: lora-profiling
swanlab_experiment_name: llama-3.2-1b-profiling-demo
swanlab_description: "LoRA fine-tuning with performance profiling"
swanlab_mode: cloud  # Options: cloud, local, offline, disabled

# SwanLab Authentication
# Recommended: Set via environment variable
#   export SWANLAB_API_KEY=your-api-key
# Or set in config (less secure):
# swanlab_api_key: your-api-key

# Optional: Team workspace
# swanlab_workspace: my-ml-team

# ============================================================================
# Performance Profiling
# ============================================================================
#
# SwanLab automatically profiles trainer methods when enabled.
# Profiling metrics appear in SwanLab dashboard under "profiling/" namespace.
#
# Built-in profiling:
# - Minimal overhead (< 0.1% per step)
# - High-precision timing (microsecond accuracy)
# - Exception-safe (logs duration even if method fails)
#
# View profiling metrics in SwanLab dashboard:
#   profiling/Time taken: AxolotlTrainer.training_step
#   profiling/Time taken: AxolotlTrainer.compute_loss
#   profiling/Time taken: AxolotlTrainer.prediction_step
#
# For custom profiling in your own trainer, see:
#   examples/swanlab/custom_trainer_profiling.py

# Completion logging is disabled for non-RLHF trainers
swanlab_log_completions: false  # Only works with DPO/KTO/ORPO/GRPO

# ============================================================================
# Optional: Compare with Multiple Runs
# ============================================================================
#
# To compare profiling metrics across different configurations:
#
# 1. Run baseline without flash attention:
#    swanlab_experiment_name: llama-3.2-1b-no-flash-attn
#    flash_attention: false
#
# 2. Run with gradient checkpointing:
#    swanlab_experiment_name: llama-3.2-1b-grad-checkpoint
#    gradient_checkpointing: true
#
# 3. Run with both:
#    swanlab_experiment_name: llama-3.2-1b-optimized
#    flash_attention: true
#    gradient_checkpointing: true
#
# Then compare profiling metrics in SwanLab dashboard to see performance impact

# ============================================================================
# Optional: Lark (Feishu) Team Notifications
# ============================================================================
#
# Get notified when profiling experiments complete:
# swanlab_lark_webhook_url: https://open.feishu.cn/open-apis/bot/v2/hook/xxxxxxxxxx
# swanlab_lark_secret: your-webhook-secret

# ============================================================================
# Profiling Best Practices
# ============================================================================
#
# 1. Run multiple epochs to see profiling trends over time
# 2. Ignore first ~10 steps (warmup period, slower)
# 3. Look for outliers (steps that take significantly longer)
# 4. Compare profiling metrics before/after optimization changes
# 5. Monitor per-rank profiling in distributed training
#
# Common bottlenecks to profile:
# - training_step: Overall step time (should be consistent)
# - compute_loss: Loss computation (scales with sequence length)
# - prediction_step: Evaluation time (can be slow for large val sets)
#
# If you see inconsistent timing:
# - Check for data loading bottlenecks
# - Monitor GPU utilization (may be CPU-bound)
# - Check for gradient accumulation effects
# - Verify CUDA kernel synchronization

# ============================================================================
# Disable WandB if you're migrating from it
# ============================================================================
# wandb_project:
# use_wandb: false