make lisa training example work on one 24gb gpu

example config for lisa
fix LISA by ensuring params are not frozen during __init__
2024-04-02 03:19:54 +00:00 · 2024-04-01 07:27:16 +00:00 · 2024-04-01 06:57:28 +00:00 · 2024-04-01 04:54:03 +00:00 · 2024-03-31 00:27:04 -04:00 · 2024-03-30 22:55:15 -04:00
29 changed files with 987 additions and 78 deletions
--- a/.github/workflows/base.yml
+++ b/.github/workflows/base.yml
@@ -16,17 +16,22 @@ jobs:
            cuda_version: 11.8.0
            python_version: "3.10"
            pytorch: 2.1.2
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 9.0+PTX"
+            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
          - cuda: "121"
            cuda_version: 12.1.0
            python_version: "3.10"
            pytorch: 2.1.2
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 9.0+PTX"
+            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
          - cuda: "121"
            cuda_version: 12.1.0
            python_version: "3.11"
            pytorch: 2.1.2
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 9.0+PTX"
+            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
          - cuda: "121"
            cuda_version: 12.1.0
            python_version: "3.11"
            pytorch: 2.2.1
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
    steps:
      - name: Checkout
        uses: actions/checkout@v3
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -28,7 +28,7 @@ jobs:
          - cuda: 121
            cuda_version: 12.1.0
            python_version: "3.11"
-            pytorch: 2.1.2
+            pytorch: 2.2.1
            axolotl_extras:
    runs-on: axolotl-gpu-runner
    steps:
@@ -63,7 +63,7 @@ jobs:
            ${{ (matrix.is_latest) && format('{0}-latest', steps.metadata.outputs.tags) || '' }}
          labels: ${{ steps.metadata.outputs.labels }}
-  build-axolotl-runpod:
+  build-axolotl-cloud:
    needs: build-axolotl
    # skip this job when the first commit message of the triggering event contains "[skip docker]"
    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'OpenAccess-AI-Collective' }}
    # this job needs to be run on self-hosted GPU runners...
@@ -84,7 +84,7 @@ jobs:
          - cuda: 121
            cuda_version: 12.1.0
            python_version: "3.11"
-            pytorch: 2.1.2
+            pytorch: 2.2.1
            axolotl_extras:
    runs-on: axolotl-gpu-runner
    steps:
@@ -113,7 +113,5 @@ jobs:
          push: ${{ github.event_name != 'pull_request' }}
          tags: |
             ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
             winglian/axolotl-runpod:main-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
             ${{ (matrix.is_latest) && format('{0}-latest', steps.metadata.outputs.tags) || '' }}
             ${{ (matrix.is_latest) && format('{0}-latest', 'winglian/axolotl-runpod:main') || '' }}
          labels: ${{ steps.metadata.outputs.labels }}
--- a/.github/workflows/nightlies.yml
+++ b/.github/workflows/nightlies.yml
@@ -0,0 +1,118 @@
 name: docker-nightlies
 on:
  workflow_dispatch:
  schedule:
    - cron: '0 0 * * *'  # Runs at 00:00 UTC every day
 jobs:
  build-axolotl:
    # skip this job when the first commit message of the triggering event contains "[skip docker]"
    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'OpenAccess-AI-Collective' }}
    strategy:
      fail-fast: false
      matrix:
        include:
          - cuda: 118
            cuda_version: 11.8.0
            python_version: "3.10"
            pytorch: 2.1.2
            axolotl_extras:
            axolotl_args: "--extra-index-url https://download.pytorch.org/whl/cu118"
            is_latest: true
          - cuda: 121
            cuda_version: 12.1.0
            python_version: "3.10"
            pytorch: 2.1.2
            axolotl_extras:
          - cuda: 121
            cuda_version: 12.1.0
            python_version: "3.11"
            pytorch: 2.2.1
            axolotl_extras:
    runs-on: axolotl-gpu-runner
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Docker metadata
        id: metadata
        uses: docker/metadata-action@v5
        with:
          images: winglian/axolotl
          tags: |
            type=raw,value={{ branch }}-{{ date 'YYYYMMDD' }}
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      - name: Login to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
      # guidance for testing before pushing: https://docs.docker.com/build/ci/github-actions/test-before-push/
      - name: Build and export to Docker
        uses: docker/build-push-action@v5
        with:
          context: .
          build-args: |
            BASE_TAG=${{ github.ref_name }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}
            CUDA=${{ matrix.cuda }}
            PYTORCH_VERSION=${{ matrix.pytorch }}
            AXOLOTL_ARGS=${{ matrix.axolotl_args }}
          file: ./docker/Dockerfile
          push: ${{ github.event_name != 'pull_request' }}
          tags: |
            ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
          labels: ${{ steps.metadata.outputs.labels }}
  build-axolotl-cloud:
    needs: build-axolotl
    # skip this job when the first commit message of the triggering event contains "[skip docker]"
    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'OpenAccess-AI-Collective' }}
    # this job needs to be run on self-hosted GPU runners...
    strategy:
      matrix:
        include:
          - cuda: 118
            cuda_version: 11.8.0
            python_version: "3.10"
            pytorch: 2.1.2
            axolotl_extras:
            is_latest: true
          - cuda: 121
            cuda_version: 12.1.0
            python_version: "3.10"
            pytorch: 2.1.2
            axolotl_extras:
          - cuda: 121
            cuda_version: 12.1.0
            python_version: "3.11"
            pytorch: 2.2.1
            axolotl_extras:
    runs-on: axolotl-gpu-runner
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Docker metadata
        id: metadata
        uses: docker/metadata-action@v5
        with:
          images: winglian/axolotl-cloud
          tags: |
            type=raw,value={{ branch }}-{{ date 'YYYYMMDD' }}
      - name: Login to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
      # keep the buildx action on the same major version as the build-axolotl job
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      - name: Build
        uses: docker/build-push-action@v5
        with:
          context: .
          build-args: |
            BASE_TAG=${{ github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
            CUDA=${{ matrix.cuda }}
          file: ./docker/Dockerfile-cloud
          push: ${{ github.event_name != 'pull_request' }}
          tags: |
             ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
          labels: ${{ steps.metadata.outputs.labels }}
--- a/.github/workflows/pypi.yml
+++ b/.github/workflows/pypi.yml
@@ -25,7 +25,7 @@ jobs:
      - name: Install dependencies
        run: |
-          pip3 install wheel
+          pip3 install wheel packaging
          pip3 install -e .
          pip3 install -r requirements-tests.txt
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -48,6 +48,8 @@ jobs:
      - name: Install dependencies
        run: |
          pip3 install --upgrade pip
          pip3 install --upgrade packaging
          pip3 install -U -e .
          pip3 install -r requirements-tests.txt
@@ -77,6 +79,11 @@ jobs:
            python_version: "3.10"
            pytorch: 2.1.2
            num_gpus: 1
          - cuda: 121
            cuda_version: 12.1.0
            python_version: "3.11"
            pytorch: 2.2.1
            num_gpus: 1
    steps:
      - name: Checkout
        uses: actions/checkout@v4
--- a/cicd/Dockerfile.jinja
+++ b/cicd/Dockerfile.jinja
@@ -22,6 +22,7 @@ RUN git fetch origin +$GITHUB_REF && \
    git checkout FETCH_HEAD
 # If AXOLOTL_EXTRAS is set, append it in brackets
 RUN pip install causal_conv1d
 RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
        pip install -e .[deepspeed,flash-attn,mamba-ssm,galore,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
    else \
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -20,6 +20,7 @@ RUN git clone --depth=1 https://github.com/OpenAccess-AI-Collective/axolotl.git
 WORKDIR /workspace/axolotl
 # If AXOLOTL_EXTRAS is set, append it in brackets
 RUN pip install causal_conv1d
 RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
        pip install -e .[deepspeed,flash-attn,mamba-ssm,galore,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
    else \
--- a/examples/gemma/qlora.yml
+++ b/examples/gemma/qlora.yml
@@ -21,7 +21,8 @@ lora_dropout: 0.05
 lora_target_linear: true
 sequence_len: 4096
-sample_packing: false
+sample_packing: true
 eval_sample_packing: false
 pad_to_sequence_len: true
 wandb_project:
--- a/examples/jamba/README.md
+++ b/examples/jamba/README.md
@@ -0,0 +1,10 @@
 # Jamba
 - ✅ qlora w/ deepspeed Zero-2 needs at least 2x GPUs and
  - 35GiB VRAM per GPU w/ minimal context length
  - 56GiB VRAM per GPU (w/ multipack enabled)
 - ✅ qlora w/ deepspeed Zero-3 needs at least 2x GPUs and 67GiB VRAM (surprisingly, more than Zero-2)
 - ✅ qlora single-gpu, ~51GiB VRAM
 - ✅ multipack
 - ❓ FSDP
 - ❓ 8-bit LoRA
--- a/examples/jamba/qlora.yaml
+++ b/examples/jamba/qlora.yaml
@@ -0,0 +1,62 @@
 base_model: ai21labs/Jamba-v0.1
 trust_remote_code: true
 load_in_8bit: false
 load_in_4bit: true
 strict: false
 datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
 dataset_prepared_path:
 val_set_size: 0.0
 output_dir: ./out
 sequence_len: 4096
 sample_packing: false
 pad_to_sequence_len: false
 eval_sample_packing: false
 wandb_project:
 wandb_entity:
 wandb_watch:
 wandb_name:
 wandb_log_model:
 adapter: qlora
 lora_r: 8
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_linear: true
 low_cpu_mem_usage: true
 gradient_accumulation_steps: 4
 micro_batch_size: 1
 num_epochs: 2
 optimizer: paged_adamw_8bit
 lr_scheduler: cosine
 learning_rate: 0.00001
 train_on_inputs: false
 group_by_length: false
 bf16: auto
 fp16:
 tf32: false
 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
  use_reentrant: false
 early_stopping_patience:
 resume_from_checkpoint:
 local_rank:
 logging_steps: 1
 xformers_attention:
 flash_attention: true
 warmup_steps: 10
 evals_per_epoch:
 saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0
 special_tokens:
--- a/examples/jamba/qlora_deepspeed.yaml
+++ b/examples/jamba/qlora_deepspeed.yaml
@@ -0,0 +1,62 @@
 base_model: ai21labs/Jamba-v0.1
 trust_remote_code: true
 load_in_8bit: false
 load_in_4bit: true
 strict: false
 datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
 dataset_prepared_path:
 val_set_size: 0.0
 output_dir: ./out
 sequence_len: 4096
 sample_packing: false
 pad_to_sequence_len: false
 eval_sample_packing: false
 wandb_project:
 wandb_entity:
 wandb_watch:
 wandb_name:
 wandb_log_model:
 adapter: qlora
 lora_r: 8
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_linear: true
 low_cpu_mem_usage: true
 gradient_accumulation_steps: 4
 micro_batch_size: 1
 num_epochs: 2
 optimizer: paged_adamw_8bit
 lr_scheduler: cosine
 learning_rate: 0.00001
 train_on_inputs: false
 group_by_length: false
 bf16: auto
 fp16:
 tf32: false
 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
  use_reentrant: false
 early_stopping_patience:
 resume_from_checkpoint:
 local_rank:
 logging_steps: 1
 xformers_attention:
 flash_attention: true
 warmup_steps: 10
 evals_per_epoch:
 saves_per_epoch: 1
 debug:
 deepspeed: deepspeed_configs/zero2.json
 weight_decay: 0.0
 special_tokens:
--- a/examples/llama-2/lisa.yml
+++ b/examples/llama-2/lisa.yml
@@ -0,0 +1,75 @@
 base_model: NousResearch/Llama-2-7b-hf
 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
 load_in_8bit: false
 load_in_4bit: false
 strict: false
 datasets:
  - path: teknium/GPT4-LLM-Cleaned
    type: alpaca
 dataset_prepared_path: last_run_prepared
 val_set_size: 0.05
 output_dir: ./lisa-out
 sequence_len: 4096
 sample_packing: true
 pad_to_sequence_len: true
 adapter:
 lora_model_dir:
 lora_r:
 lora_alpha:
 lora_dropout:
 lora_target_linear:
 lora_fan_in_fan_out:
 # LISA: number of layers activated (gradients enabled) at a time
 lisa_n_layers: 2
 # LISA: how often, in training steps, to switch which layers are active
 lisa_step_interval: 20
 # LISA: dotted attribute path under the model used to access the layers
 lisa_layers_attribute: model.layers
 wandb_project:
 wandb_entity:
 wandb_watch:
 wandb_name:
 wandb_log_model:
 gradient_accumulation_steps: 1
 micro_batch_size: 1
 num_epochs: 1
 optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 # written in decimal form: a bare `5e-5` has no decimal point and is loaded as a
 # string (not a float) by YAML 1.1 parsers such as PyYAML
 learning_rate: 0.00005  # 5e-5, recommendation from lisa paper for 7b
 train_on_inputs: false
 group_by_length: false
 bf16: auto
 fp16:
 tf32: false
 gradient_checkpointing: true
 early_stopping_patience:
 resume_from_checkpoint:
 local_rank:
 logging_steps: 1
 xformers_attention:
 flash_attention: true
 flash_attn_cross_entropy: false
 flash_attn_rms_norm: true
 flash_attn_fuse_qkv: false
 flash_attn_fuse_mlp: true
 warmup_steps: 100
 evals_per_epoch: 4
 eval_table_size:
 saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.1
 fsdp:
 fsdp_config:
 special_tokens:
  bos_token: "<s>"
  eos_token: "</s>"
  unk_token: "<unk>"
--- a/examples/qwen/README.md
+++ b/examples/qwen/README.md
@@ -0,0 +1,10 @@
 # Qwen
 TODO
 # Qwen2 MoE
 ✅ multipack
 ✅ qwen2_moe 4-bit QLoRA
 ✅ qwen2_moe 16-bit LoRA
 ❓ qwen2_moe 8-bit LoRA
--- a/examples/qwen/qwen2-moe-lora.yaml
+++ b/examples/qwen/qwen2-moe-lora.yaml
@@ -0,0 +1,64 @@
 base_model: Qwen/Qwen1.5-MoE-A2.7B
 trust_remote_code: true
 load_in_8bit: false
 load_in_4bit: false
 strict: false
 datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
 dataset_prepared_path:
 val_set_size: 0.05
 output_dir: ./out
 sequence_len: 1024  # supports up to 32k
 sample_packing: false
 pad_to_sequence_len: false
 adapter: lora
 lora_model_dir:
 lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_linear: true
 lora_fan_in_fan_out:
 wandb_project:
 wandb_entity:
 wandb_watch:
 wandb_name:
 wandb_log_model:
 gradient_accumulation_steps: 4
 micro_batch_size: 1
 num_epochs: 4
 optimizer: paged_adamw_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002
 train_on_inputs: false
 group_by_length: false
 bf16: auto
 fp16:
 tf32: true
 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
  use_reentrant: false
 early_stopping_patience:
 resume_from_checkpoint:
 local_rank:
 logging_steps: 1
 xformers_attention:
 flash_attention: true
 warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0
 fsdp:
 fsdp_config:
 special_tokens:
--- a/examples/qwen/qwen2-moe-qlora.yaml
+++ b/examples/qwen/qwen2-moe-qlora.yaml
@@ -0,0 +1,64 @@
 base_model: Qwen/Qwen1.5-MoE-A2.7B
 trust_remote_code: true
 load_in_8bit: false
 load_in_4bit: true
 strict: false
 datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
 dataset_prepared_path:
 val_set_size: 0.05
 output_dir: ./out
 sequence_len: 1024  # supports up to 32k
 sample_packing: false
 pad_to_sequence_len: false
 # QLoRA example: the base model is loaded in 4-bit (load_in_4bit: true above),
 # so the adapter type must be qlora rather than plain lora
 adapter: qlora
 lora_model_dir:
 lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_linear: true
 lora_fan_in_fan_out:
 wandb_project:
 wandb_entity:
 wandb_watch:
 wandb_name:
 wandb_log_model:
 gradient_accumulation_steps: 4
 micro_batch_size: 1
 num_epochs: 4
 optimizer: paged_adamw_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002
 train_on_inputs: false
 group_by_length: false
 bf16: auto
 fp16:
 tf32: true
 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
  use_reentrant: false
 early_stopping_patience:
 resume_from_checkpoint:
 local_rank:
 logging_steps: 1
 xformers_attention:
 flash_attention: true
 warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0
 fsdp:
 fsdp_config:
 special_tokens:
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
 --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/
 packaging==23.2
-peft==0.9.0
+peft==0.10.0
-transformers @ git+https://github.com/huggingface/transformers.git@73a73b415e36f41481369f6129cb4b62bb127a78
+transformers @ git+https://github.com/huggingface/transformers.git@43d17c18360ac9c3d3491389328e2fe55fe8f9ce
 tokenizers==0.15.0
 bitsandbytes==0.43.0
 accelerate==0.28.0
@@ -32,11 +32,11 @@ fschat==0.2.36
 gradio==3.50.2
 tensorboard
-mamba-ssm==1.1.1
+mamba-ssm==1.2.0.post1
 # remote filesystems
 s3fs
 gcsfs
 # adlfs
-trl @ git+https://github.com/huggingface/trl.git@304e208f778a5442c30cdda500348226cdc97d90
+trl @ git+https://github.com/huggingface/trl.git@0ee349dcd43b0f4b3169449f16751c38ac4a609f
--- a/setup.py
+++ b/setup.py
@@ -78,7 +78,7 @@ setup(
            "deepspeed-kernels",
        ],
        "mamba-ssm": [
-            "mamba-ssm==1.0.1",
+            "mamba-ssm==1.2.0.post1",
        ],
        "auto-gptq": [
            "auto-gptq==0.5.1",
--- a/src/axolotl/core/trainer_builder.py
+++ b/src/axolotl/core/trainer_builder.py
@@ -45,6 +45,7 @@ from axolotl.utils.callbacks import (
    causal_lm_bench_eval_callback_factory,
    log_prediction_callback_factory,
 )
 from axolotl.utils.callbacks.lisa import lisa_callback_factory
 from axolotl.utils.collators import (
    BatchSamplerDataCollatorForSeq2Seq,
    DataCollatorForSeq2Seq,
@@ -200,6 +201,18 @@ class AxolotlTrainingArguments(TrainingArguments):
    orpo_alpha: Optional[float] = field(
        default=None,
    )
    lisa_n_layers: Optional[int] = field(
        default=None,
        metadata={"help": "the number of activate layers in LISA"},
    )
    lisa_step_interval: Optional[int] = field(
        default=None,
        metadata={"help": "how often to switch layers in LISA"},
    )
    lisa_layers_attribute: Optional[str] = field(
        default=None,
        metadata={"help": "path under the model to access the layers"},
    )
 class AxolotlTrainer(Trainer):
@@ -938,6 +951,8 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
            )
            callbacks.append(early_stop_cb)
        if self.cfg.lisa_step_interval and self.cfg.lisa_n_layers:
            callbacks.append(lisa_callback_factory(trainer))
        return callbacks
    def _get_trainer_cls(self):
@@ -1229,6 +1244,15 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
                    "relora_prune_ratio"
                ] = self.cfg.relora_prune_ratio
        if self.cfg.lisa_step_interval and self.cfg.lisa_n_layers:
            training_arguments_kwargs["lisa_n_layers"] = self.cfg.lisa_n_layers
            training_arguments_kwargs[
                "lisa_step_interval"
            ] = self.cfg.lisa_step_interval
            training_arguments_kwargs[
                "lisa_layers_attribute"
            ] = self.cfg.lisa_layers_attribute
        training_arguments_kwargs = self.hook_pre_create_training_args(
            training_arguments_kwargs
        )
--- a/src/axolotl/monkeypatch/llama_attn_hijack_flash.py
+++ b/src/axolotl/monkeypatch/llama_attn_hijack_flash.py
@@ -284,12 +284,7 @@ def flashattn_forward_with_s2attn(
    # [bsz, nh, q_len, hd]
    # pylint: disable=duplicate-code
-    kv_seq_len = key_states.shape[-2]
+    cos, sin = self.rotary_emb(value_states, position_ids=position_ids)
    if past_key_value is not None:
        kv_seq_len += past_key_value[0].shape[-2]
    cos, sin = self.rotary_emb(
        value_states, seq_len=kv_seq_len, position_ids=position_ids
    )
    query_states, key_states = apply_rotary_pos_emb(
        query_states, key_states, cos, sin, position_ids
    )
@@ -435,13 +430,7 @@ def flashattn_forward(
    # [bsz, q_len, nh, hd]
    # [bsz, nh, q_len, hd]
-    kv_seq_len = key_states.shape[-2]
+    cos, sin = self.rotary_emb(value_states, position_ids=position_ids)
    if past_key_value is not None:
        kv_seq_len += past_key_value[0].shape[-2]
    cos, sin = self.rotary_emb(
        value_states, seq_len=kv_seq_len, position_ids=position_ids
    )
    query_states, key_states = apply_rotary_pos_emb(
        query_states, key_states, cos, sin, position_ids
    )
--- a/src/axolotl/monkeypatch/llama_attn_hijack_xformers.py
+++ b/src/axolotl/monkeypatch/llama_attn_hijack_xformers.py
@@ -80,11 +80,7 @@ def xformers_forward(
    # [bsz, q_len, nh, hd]
    # [bsz, nh, q_len, hd]
-    kv_seq_len = key_states.shape[-2]
+    cos, sin = self.rotary_emb(value_states)
    if past_key_value is not None:
        kv_seq_len += past_key_value[0].shape[-2]
    cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
    query_states, key_states = apply_rotary_pos_emb(
        query_states, key_states, cos, sin, position_ids
    )
--- a/src/axolotl/monkeypatch/multipack.py
+++ b/src/axolotl/monkeypatch/multipack.py
@@ -12,6 +12,7 @@ from axolotl.monkeypatch.utils import get_unpad_data
 SUPPORTED_MULTIPACK_MODEL_TYPES = [
    "mixtral",
    "qwen2",
    "qwen2_moe",
    "falcon",
    "phi",
    "gemma",
@@ -31,6 +32,10 @@ def patch_for_multipack(model_type, model_name=None):
        transformers.models.qwen2.modeling_qwen2._get_unpad_data = (  # pylint: disable=protected-access
            get_unpad_data
        )
    elif model_type == "qwen2_moe":
        transformers.models.qwen2_moe.modeling_qwen2_moe._get_unpad_data = (  # pylint: disable=protected-access
            get_unpad_data
        )
    elif model_type == "falcon":
        transformers.models.falcon.modeling_falcon._get_unpad_data = (  # pylint: disable=protected-access
            get_unpad_data
@@ -48,14 +53,16 @@ def patch_for_multipack(model_type, model_name=None):
            get_unpad_data
        )
    elif model_type == "gemmoe":
-        model_config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
+        patch_remote(model_name, ".configuration_gemmoe", ".modeling_gemmoe")
-        # we need to load the model here in order for modeling_gemmoe to be available
+    elif model_type == "jamba":
-        with init_empty_weights():
+        patch_remote(model_name, ".configuration_jamba", ".modeling_jamba")
-            AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
+
-        module_name = model_config.__class__.__module__.replace(
+
-            ".configuration_gemmoe", ".modeling_gemmoe"
+def patch_remote(model_name, config_name, modeling_name):
-        )
+    model_config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
-        modeling_gemmoe = importlib.import_module(module_name)
+    # we need to load the model here in order for modeling_* to be available
-        modeling_gemmoe._get_unpad_data = (  # pylint: disable=protected-access
+    with init_empty_weights():
-            get_unpad_data
+        AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
-        )
+    module_name = model_config.__class__.__module__.replace(config_name, modeling_name)
    modeling_arch = importlib.import_module(module_name)
    modeling_arch._get_unpad_data = get_unpad_data  # pylint: disable=protected-access
--- a/src/axolotl/utils/callbacks/lisa.py
+++ b/src/axolotl/utils/callbacks/lisa.py
@@ -0,0 +1,91 @@
 """
 module for LISA
 Adapted from https://github.com/OptimalScale/LMFlow/pull/701 for HF transformers & Axolotl
 Arxiv: https://arxiv.org/abs/2403.17919
 License: Apache 2.0
 """
 import logging
 from functools import reduce
 from typing import TYPE_CHECKING
 import numpy as np
 from transformers import TrainerCallback
 if TYPE_CHECKING:
    from axolotl.core.trainer_builder import AxolotlTrainer
 LOG = logging.getLogger("axolotl.callbacks.lisa")
 def lisa_callback_factory(trainer: "AxolotlTrainer"):
    """
    Build the LISA layer-switching callback for ``trainer``.

    Reads ``lisa_n_layers``, ``lisa_step_interval`` and ``lisa_layers_attribute``
    from ``trainer.args`` and returns a ``LISACallback`` instance bound to the trainer.
    """

    class LISACallback(TrainerCallback):
        """trainer callback for lisa layer switching"""

        def __init__(
            self, n_layers, step_interval, trainer, layers_attribute="model.layers"
        ):
            """
            Args:
                n_layers: number of layers to activate at a time
                step_interval: how often (in steps) to re-select the active layers
                trainer: trainer whose ``model`` holds the layers
                layers_attribute: dotted attribute path from the model to the layers

            Raises:
                ValueError: if ``n_layers`` exceeds the number of layers found
            """
            super().__init__()
            self.n_layers = n_layers
            self.step_interval = step_interval
            # the factory passes this value explicitly, so an unset (None) training
            # argument would bypass the parameter default and break the path lookup
            self.layers_attribute = layers_attribute or "model.layers"
            self.trainer = trainer
            self.total_layers = len(self._get_layers())
            if self.n_layers > self.total_layers:
                # fail at construction instead of at the first training step, where
                # sampling without replacement would raise a less helpful ValueError
                raise ValueError(
                    f"lisa_n_layers ({self.n_layers}) cannot exceed the number of layers ({self.total_layers}) found at `{self.layers_attribute}`"
                )
            self.active_layers_indices = []
            # nothing is frozen here; freezing only happens once training steps begin
            LOG.info(
                f"LISA will activate {self.n_layers}/{self.total_layers} layers ({self.n_layers*100/self.total_layers}%) every {self.step_interval} steps"
            )

        def _get_layers(self):
            """resolve the dotted ``layers_attribute`` path starting at the trainer's model"""
            return reduce(
                getattr, self.layers_attribute.split("."), self.trainer.model
            )

        def freeze_all_layers(self):
            """disable gradients for every parameter of every layer"""
            for layer in self._get_layers():
                for param in layer.parameters():
                    param.requires_grad = False

        def on_step_begin(
            self, args, state, control, **kwargs
        ):  # pylint: disable=unused-argument
            # Switch whenever global_step is a multiple of step_interval (this
            # covers step 0) and additionally at global_step == 1.
            # NOTE(review): the extra step-1 switch re-samples the layers one step
            # after the initial selection; kept to preserve existing behaviour.
            if state.global_step % self.step_interval == 0 or state.global_step == 1:
                self.switch_active_layers()

        def switch_active_layers(self):
            """freeze all layers, then enable gradients for ``n_layers`` random ones"""
            # First, disable gradients for all layers
            self.freeze_all_layers()
            # Randomly select n_layers distinct layers to activate
            layers = self._get_layers()
            self.active_layers_indices = np.random.choice(
                range(self.total_layers), self.n_layers, replace=False
            )
            LOG.info(
                f"Activating layers at indices: {self.active_layers_indices} for the next steps."
            )
            # Enable gradients only for the selected layers
            for idx in self.active_layers_indices:
                for param in layers[idx].parameters():
                    param.requires_grad = True

    lisa_callback = LISACallback(
        n_layers=trainer.args.lisa_n_layers,
        step_interval=trainer.args.lisa_step_interval,
        trainer=trainer,
        layers_attribute=trainer.args.lisa_layers_attribute,
    )
    return lisa_callback
--- a/src/axolotl/utils/config/init.py
+++ b/src/axolotl/utils/config/init.py
@@ -208,11 +208,11 @@ def validate_config(cfg: DictDefault, capabilities: Optional[dict] = None):
            dict(
                AxolotlConfigWCapabilities(
                    **cfg.to_dict(), capabilities=capabilities
-                ).model_dump(exclude_unset=True)
+                ).model_dump(exclude_none=True)
            )
        )
    return DictDefault(
-        dict(AxolotlInputConfig(**cfg.to_dict()).model_dump(exclude_unset=True))
+        dict(AxolotlInputConfig(**cfg.to_dict()).model_dump(exclude_none=True))
    )
--- a/src/axolotl/utils/config/models/input/v0_4_1/init.py
+++ b/src/axolotl/utils/config/models/input/v0_4_1/init.py
@@ -6,7 +6,7 @@ Module for pydantic models for configuration
 import logging
 import os
 from enum import Enum
-from typing import Any, Dict, List, Literal, Optional, Union
+from typing import Any, Dict, List, Literal, Optional, Tuple, Union
 from pydantic import BaseModel, Field, conlist, field_validator, model_validator
 from transformers import SchedulerType
@@ -151,12 +151,6 @@ class PeftConfig(BaseModel):
    loftq_config: Optional[LoftQConfig] = None
 class AutoType(str, Enum):
    """auto type string configuration subset - used for bf16"""
    AUTO = "auto"
 class SpecialTokensConfig(BaseModel):
    """Special tokens configuration subset"""
@@ -185,7 +179,8 @@ class LoraConfig(BaseModel):
    peft_layers_to_transform: Optional[List[int]] = None
    peft: Optional[PeftConfig] = None
    peft_use_dora: Optional[bool] = None
-    peft_use_relora: Optional[bool] = None
+    peft_use_rslora: Optional[bool] = None
    peft_layer_replication: Optional[List[Tuple[int, int]]] = None
    lora_on_cpu: Optional[bool] = None
    gptq: Optional[bool] = None
@@ -307,12 +302,14 @@ class HyperparametersConfig(BaseModel):
        },
    )
-    train_on_inputs: Optional[bool] = None
+    train_on_inputs: Optional[bool] = False
    group_by_length: Optional[bool] = None
    learning_rate: Union[str, float]
-    weight_decay: Optional[float] = None
+    weight_decay: Optional[float] = 0.0
-    optimizer: Optional[Union[OptimizerNames, Literal["lion_pytorch"]]] = None
+    optimizer: Optional[
        Union[OptimizerNames, Literal["lion_pytorch"]]
    ] = OptimizerNames.ADAMW_HF.value
    optim_args: Optional[Union[str, Dict[str, Any]]] = Field(
        default=None, metadata={"help": "Optional arguments to supply to optimizer."}
    )
@@ -323,7 +320,7 @@ class HyperparametersConfig(BaseModel):
        },
    )
    torchdistx_path: Optional[str] = None
-    lr_scheduler: Optional[SchedulerType] = None
+    lr_scheduler: Optional[SchedulerType] = "cosine"
    lr_scheduler_kwargs: Optional[Dict[str, Any]] = None
    lr_quadratic_warmup: Optional[bool] = None
    cosine_min_lr_ratio: Optional[float] = None
@@ -373,6 +370,23 @@ class MLFlowConfig(BaseModel):
    hf_mlflow_log_artifacts: Optional[bool] = None
 class LISAConfig(BaseModel):
    """LISA options"""
    # number of layers activated at a time; unset (None) by default
    lisa_n_layers: Optional[int] = Field(
        default=None,
        metadata={"help": "the number of activate layers in LISA"},
    )
    # interval, in training steps, between switches of the active layers; unset (None) by default
    lisa_step_interval: Optional[int] = Field(
        default=None,
        metadata={"help": "how often to switch layers in LISA"},
    )
    # dotted attribute path from the model to its layers; defaults to "model.layers"
    lisa_layers_attribute: Optional[str] = Field(
        default="model.layers",
        metadata={"help": "path under the model to access the layers"},
    )
 class WandbConfig(BaseModel):
    """wandb configuration subset"""
@@ -407,6 +421,7 @@ class AxolotlInputConfig(
    HyperparametersConfig,
    WandbConfig,
    MLFlowConfig,
    LISAConfig,
    RemappedParameters,
    DeprecatedParameters,
    BaseModel,
@@ -473,7 +488,7 @@ class AxolotlInputConfig(
    loss_watchdog_threshold: Optional[float] = None
    loss_watchdog_patience: Optional[int] = None
-    bf16: Optional[Union[AutoType, bool]] = AutoType.AUTO
+    bf16: Optional[Union[Literal["auto"], bool]] = "auto"
    fp16: Optional[bool] = None
    bfloat16: Optional[bool] = None  # for non-AMP cases
    float16: Optional[bool] = None  # for non-AMP cases
@@ -487,7 +502,7 @@ class AxolotlInputConfig(
    unfrozen_parameters: Optional[List[str]] = None
-    sequence_len: int = Field(default=1024)
+    sequence_len: int = Field(default=512)
    sample_packing: Optional[bool] = None
    eval_sample_packing: Optional[bool] = None
    pad_to_sequence_len: Optional[bool] = None
@@ -536,6 +551,7 @@ class AxolotlInputConfig(
        Dict[Union[int, Literal["cpu", "disk"]], Union[int, str]]
    ] = None
    gpu_memory_limit: Optional[Union[int, str]] = None
    low_cpu_mem_usage: Optional[bool] = None
    chat_template: Optional[ChatTemplate] = None
    default_system_message: Optional[str] = None
@@ -548,10 +564,10 @@ class AxolotlInputConfig(
    sample_packing_eff_est: Optional[float] = None
    axolotl_config_path: Optional[str] = None
-    is_falcon_derived_model: Optional[bool] = Field(default=False)
+    is_falcon_derived_model: Optional[bool] = Field(default=None)
-    is_llama_derived_model: Optional[bool] = Field(default=False)
+    is_llama_derived_model: Optional[bool] = Field(default=None)
-    is_mistral_derived_model: Optional[bool] = Field(default=False)
+    is_mistral_derived_model: Optional[bool] = Field(default=None)
-    is_qwen_derived_model: Optional[bool] = Field(default=False)
+    is_qwen_derived_model: Optional[bool] = Field(default=None)
    @field_validator("datasets", mode="before")
    @classmethod
--- a/src/axolotl/utils/data.py
+++ b/src/axolotl/utils/data.py
@@ -1,4 +1,5 @@
 """Module containing data utilities"""
 import functools
 import hashlib
 import logging
@@ -223,7 +224,7 @@ def load_tokenized_prepared_datasets(
                    token=use_auth_token,
                )
                ds_from_hub = True
-            except (FileNotFoundError, ConnectionError, HFValidationError):
+            except (FileNotFoundError, ConnectionError, HFValidationError, ValueError):
                pass
            ds_from_cloud = False
@@ -290,14 +291,17 @@ def load_tokenized_prepared_datasets(
            local_path = Path(config_dataset.path)
            if local_path.exists():
                if local_path.is_dir():
-                    # TODO dirs with arrow or parquet files could be loaded with `load_from_disk`
+                    if config_dataset.data_files:
-                    ds = load_dataset(
+                        ds_type = get_ds_type(config_dataset)
-                        config_dataset.path,
+                        ds = load_dataset(
-                        name=config_dataset.name,
+                            ds_type,
-                        data_files=config_dataset.data_files,
+                            name=config_dataset.name,
-                        streaming=False,
+                            data_files=config_dataset.data_files,
-                        split=None,
+                            streaming=False,
-                    )
+                            split=None,
                        )
                    else:
                        ds = load_from_disk(config_dataset.path)
                elif local_path.is_file():
                    ds_type = get_ds_type(config_dataset)
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -402,7 +402,9 @@ def load_model(
        from accelerate import infer_auto_device_map
        with init_empty_weights():
-            model_canvas = AutoModelForCausalLM.from_config(model_config)
+            model_canvas = AutoModelForCausalLM.from_config(
                model_config, trust_remote_code=cfg.trust_remote_code or False
            )
        model_canvas.tie_weights()
        device_map = infer_auto_device_map(
            model_canvas,
@@ -454,6 +456,10 @@ def load_model(
            "bnb_4bit_quant_type": "nf4",
            "bnb_4bit_quant_storage": torch.bfloat16,
        }
        if not cfg.deepspeed:
            # for some reason, this causes the loss to be off by an order of magnitude
            # but deepspeed needs this still in bfloat16
            bnb_config["bnb_4bit_quant_storage"] = torch.float32
        if cfg.bnb_config_kwargs:
            bnb_config.update(cfg.bnb_config_kwargs)
@@ -502,6 +508,9 @@ def load_model(
        model_kwargs["attn_implementation"] = "eager"
        model_config._attn_implementation = "eager"  # pylint: disable=protected-access
    if cfg.low_cpu_mem_usage:
        model_kwargs["low_cpu_mem_usage"] = True
    qlora_fsdp = cfg.fsdp and cfg.adapter == "qlora"
    try:
@@ -849,7 +858,9 @@ def load_lora(model, cfg, inference=False, config_only=False):
    if cfg.peft_use_dora:
        lora_config_kwargs["use_dora"] = cfg.peft_use_dora
    if cfg.peft_use_rslora:
-        lora_config_kwargs["use_rslora"] = cfg.use_rslora
+        lora_config_kwargs["use_rslora"] = cfg.peft_use_rslora
    if cfg.peft_layer_replication:
        lora_config_kwargs["layer_replication"] = cfg.peft_layer_replication
    lora_config = LoraConfig(
        r=cfg.lora_r,
--- a/src/axolotl/utils/trainer.py
+++ b/src/axolotl/utils/trainer.py
@@ -11,6 +11,7 @@ import torch.cuda
 from accelerate.logging import get_logger
 from datasets import set_caching_enabled
 from torch.utils.data import DataLoader, RandomSampler
 from transformers.utils import is_torch_bf16_gpu_available
 from axolotl.core.trainer_builder import HFCausalTrainerBuilder, HFDPOTrainerBuilder
 from axolotl.utils.distributed import is_main_process, reduce_and_broadcast, zero_first
@@ -124,9 +125,10 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset):
                eval_dataset = eval_dataset.remove_columns("attention_mask")
        if cfg.model_config_type == "falcon":
-            LOG.info("dropping token_type_ids column")
+            LOG.info("dropping token_type_ids column if it exists")
-            train_dataset = train_dataset.remove_columns("token_type_ids")
+            if "token_type_ids" in train_dataset.column_names:
-            if eval_dataset:
+                train_dataset = train_dataset.remove_columns("token_type_ids")
            if eval_dataset and "token_type_ids" in eval_dataset.column_names:
                eval_dataset = eval_dataset.remove_columns("token_type_ids")
        train_dataset = train_dataset.filter(
@@ -310,6 +312,8 @@ def setup_fsdp_envs(cfg):
        os.environ["FSDP_USE_ORIG_PARAMS"] = "true"
    if cfg.fsdp_config.fsdp_state_dict_type:
        os.environ["FSDP_STATE_DICT_TYPE"] = cfg.fsdp_config.fsdp_state_dict_type
    if cfg.fsdp_config.fsdp_auto_wrap_policy:
        os.environ["FSDP_AUTO_WRAP_POLICY"] = cfg.fsdp_config.fsdp_auto_wrap_policy
    if cfg.fsdp_config.fsdp_transformer_layer_cls_to_wrap:
        os.environ[
            "FSDP_TRANSFORMER_CLS_TO_WRAP"
@@ -323,6 +327,11 @@ def prepare_optim_env(cfg):
        os.environ["ACCELERATE_USE_DEEPSPEED"] = "true"
        os.environ["ACCELERATE_DEEPSPEED_CONFIG_FILE"] = cfg.deepspeed
    if (cfg.bf16 == "auto" and is_torch_bf16_gpu_available()) or cfg.bf16 is True:
        os.environ["ACCELERATE_MIXED_PRECISION"] = "bf16"
    elif cfg.fp16:
        os.environ["ACCELERATE_MIXED_PRECISION"] = "fp16"
 def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_steps):
    if cfg.rl in ["dpo", "ipo", "kto_pair"]:
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -0,0 +1,272 @@
 """
 Test dataset loading under various conditions.
 """
 import shutil
 import tempfile
 import unittest
 from pathlib import Path
 from datasets import Dataset
 from huggingface_hub import snapshot_download
 from transformers import AutoTokenizer
 from axolotl.utils.data import load_tokenized_prepared_datasets
 from axolotl.utils.dict import DictDefault
 class TestDatasetPreparation(unittest.TestCase):
    """Test a configured dataloader.

    Every test builds a config with a single dataset entry, runs it through
    `load_tokenized_prepared_datasets`, and checks the row count plus the
    presence of the `input_ids`, `attention_mask` and `labels` features.
    """

    def setUp(self) -> None:
        """Create the tokenizer and a one-row alpaca-style dataset for the tests."""
        self.tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
        self.tokenizer.add_special_tokens(
            {
                "bos_token": "<s>",
                "eos_token": "</s>",
                "unk_token": "<unk>",
            }
        )
        # Alpaca dataset.
        self.dataset = Dataset.from_list(
            [
                {
                    "instruction": "Evaluate this sentence for spelling and grammar mistakes",
                    "input": "He finnished his meal and left the resturant",
                    "output": "He finished his meal and left the restaurant.",
                }
            ]
        )

    def _load_prepared(self, tmp_dir, dataset_cfg, sequence_len):
        """Run the loader for one dataset entry.

        Args:
            tmp_dir: scratch directory; the prepared path is `<tmp_dir>/prepared`.
            dataset_cfg: dict describing the single entry under `datasets`.
            sequence_len: value for the `sequence_len` config key.

        Returns:
            The first element of the tuple returned by
            `load_tokenized_prepared_datasets`.
        """
        cfg = DictDefault(
            {
                "tokenizer_config": "huggyllama/llama-7b",
                "sequence_len": sequence_len,
                "datasets": [dataset_cfg],
            }
        )
        dataset, _ = load_tokenized_prepared_datasets(
            self.tokenizer, cfg, Path(tmp_dir) / "prepared"
        )
        return dataset

    def _assert_tokenized(self, dataset, expected_rows):
        """Check the row count and that the three expected features are present."""
        self.assertEqual(len(dataset), expected_rows)
        for feature in ("input_ids", "attention_mask", "labels"):
            self.assertIn(feature, dataset.features)

    def test_load_hub(self):
        """Core use case.  Verify that processing data from the hub works"""
        with tempfile.TemporaryDirectory() as tmp_dir:
            dataset = self._load_prepared(
                tmp_dir,
                {
                    "path": "mhenrichsen/alpaca_2k_test",
                    "type": "alpaca",
                },
                sequence_len=1024,
            )
            self._assert_tokenized(dataset, 2000)

    def test_load_local_hub(self):
        """Niche use case.  Verify that a local copy of a hub dataset can be loaded"""
        # The snapshot is written relative to the current working directory,
        # not inside the temporary directory.
        tmp_ds_path = Path("mhenrichsen/alpaca_2k_test")
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_ds_path.mkdir(parents=True, exist_ok=True)
            try:
                snapshot_download(
                    repo_id="mhenrichsen/alpaca_2k_test",
                    repo_type="dataset",
                    local_dir=tmp_ds_path,
                )
                # Right now a local copy that doesn't fully conform to a dataset
                # must list data_files and ds_type otherwise the loader won't know
                # how to load it.
                dataset = self._load_prepared(
                    tmp_dir,
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "ds_type": "parquet",
                        "type": "alpaca",
                        "data_files": [
                            "mhenrichsen/alpaca_2k_test/alpaca_2000.parquet",
                        ],
                    },
                    sequence_len=1024,
                )
                self._assert_tokenized(dataset, 2000)
            finally:
                # Remove the downloaded copy even when the download or an
                # assertion fails, so a failed run does not leave it behind.
                shutil.rmtree(tmp_ds_path, ignore_errors=True)

    def test_load_from_save_to_disk(self):
        """Usual use case.  Verify datasets saved via `save_to_disk` can be loaded."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_ds_name = Path(tmp_dir) / "tmp_dataset"
            self.dataset.save_to_disk(tmp_ds_name)
            dataset = self._load_prepared(
                tmp_dir,
                {
                    "path": str(tmp_ds_name),
                    "type": "alpaca",
                },
                sequence_len=256,
            )
            self._assert_tokenized(dataset, 1)

    def test_load_from_dir_of_parquet(self):
        """Usual use case.  Verify a directory of parquet files can be loaded."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_ds_dir = Path(tmp_dir) / "tmp_dataset"
            tmp_ds_dir.mkdir()
            tmp_ds_path = tmp_ds_dir / "shard1.parquet"
            self.dataset.to_parquet(tmp_ds_path)
            dataset = self._load_prepared(
                tmp_dir,
                {
                    "path": str(tmp_ds_dir),
                    "ds_type": "parquet",
                    "name": "test_data",
                    "data_files": [
                        str(tmp_ds_path),
                    ],
                    "type": "alpaca",
                },
                sequence_len=256,
            )
            self._assert_tokenized(dataset, 1)

    def test_load_from_dir_of_json(self):
        """Standard use case.  Verify a directory of json files can be loaded."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_ds_dir = Path(tmp_dir) / "tmp_dataset"
            tmp_ds_dir.mkdir()
            tmp_ds_path = tmp_ds_dir / "shard1.json"
            self.dataset.to_json(tmp_ds_path)
            dataset = self._load_prepared(
                tmp_dir,
                {
                    "path": str(tmp_ds_dir),
                    "ds_type": "json",
                    "name": "test_data",
                    "data_files": [
                        str(tmp_ds_path),
                    ],
                    "type": "alpaca",
                },
                sequence_len=256,
            )
            self._assert_tokenized(dataset, 1)

    def test_load_from_single_parquet(self):
        """Standard use case.  Verify a single parquet file can be loaded."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_ds_path = Path(tmp_dir) / "tmp_dataset.parquet"
            self.dataset.to_parquet(tmp_ds_path)
            dataset = self._load_prepared(
                tmp_dir,
                {
                    "path": str(tmp_ds_path),
                    "name": "test_data",
                    "type": "alpaca",
                },
                sequence_len=256,
            )
            self._assert_tokenized(dataset, 1)

    def test_load_from_single_json(self):
        """Standard use case.  Verify a single json file can be loaded."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_ds_path = Path(tmp_dir) / "tmp_dataset.json"
            self.dataset.to_json(tmp_ds_path)
            dataset = self._load_prepared(
                tmp_dir,
                {
                    "path": str(tmp_ds_path),
                    "name": "test_data",
                    "type": "alpaca",
                },
                sequence_len=256,
            )
            self._assert_tokenized(dataset, 1)
 # Run the unittest runner when this module is executed as a script.
 if __name__ == "__main__":
    unittest.main()
--- a/tests/test_validation.py
+++ b/tests/test_validation.py
@@ -54,6 +54,18 @@ class TestValidation(BaseValidation):
    Test the validation module
    """
    def test_defaults(self, minimal_cfg):
        """Validate a config with an explicit null `weight_decay`.

        After validation, `train_on_inputs` must be False and `weight_decay`
        must still be None.
        """
        overrides = {"weight_decay": None}
        validated = validate_config(DictDefault(overrides | minimal_cfg))
        assert validated.train_on_inputs is False
        assert validated.weight_decay is None
    def test_datasets_min_length(self):
        cfg = DictDefault(
            {
Author	SHA1	Message	Date
Aman Karmani	dfe591435f	make lisa training example work on one 24gb gpu	2024-04-02 03:19:54 +00:00
Aman Karmani	5dd9364c00	example config for lisa	2024-04-01 07:27:16 +00:00
Aman Karmani	6185cd5227	fix LISA by ensuring params are not frozen during __init__	2024-04-01 06:57:28 +00:00
Aman Karmani	b357c93f23	improve lisa callback logging	2024-04-01 04:54:03 +00:00
Wing Lian	21a5094226	fix default and fix attribute traversal for layers	2024-03-31 00:27:04 -04:00
Wing Lian	3a9ad7c66e	add lisa support	2024-03-30 22:55:15 -04:00
Wing Lian	89134f2143	make sure to install causal_conv1d in docker (#1459 )	2024-03-29 16:43:25 -04:00
Wing Lian	6086be85f7	qwen2_moe support w multipack (#1455 )	2024-03-29 11:04:53 -04:00
Wing Lian	4a92a3b9ee	Nightlies fix v4 (#1458 ) [skip ci] * another attempt at github actions * try again	2024-03-29 11:04:34 -04:00
Wing Lian	46a73e3d1a	fix yaml parsing for workflow (#1457 ) [skip ci]	2024-03-29 10:21:08 -04:00
Wing Lian	da3415bb5a	fix how nightly tag is generated (#1456 ) [skip ci]	2024-03-29 09:29:17 -04:00
Wing Lian	8cb127abeb	configure nightly docker builds (#1454 ) [skip ci] * configure nightly docker builds * also test update pytorch in modal ci	2024-03-29 08:25:45 -04:00
Wing Lian	05b398a072	fix some of the edge cases for Jamba (#1452 ) * fix some of the edge cases for Jamba * update requirements for jamba	2024-03-29 02:38:02 -04:00
Keith Stevens	e634118f90	Support loading datasets saved via save_to_disk (#1432 ) * Support loading datasetes saved via save_to_disk * Adding comprehensive unittests * Fix dataset tests due to new hash changes	2024-03-29 00:19:36 -04:00
Wing Lian	02af0820f7	Jamba (#1451 ) * fixes for larger models * add qlora example for deepspeed * add readme for jamba	2024-03-28 21:03:22 -04:00
Wing Lian	4155e9988f	fix layer_replication arg to peft (#1446 )	2024-03-27 10:18:56 -04:00
Wing Lian	25afd35842	support layer replication for peft and fix rslora integration (#1445 )	2024-03-27 10:16:47 -04:00
Wing Lian	da265dd796	fix for accelerate env var for auto bf16, add new base image and expand torch_cuda_arch_list support (#1413 )	2024-03-26 16:46:19 -04:00
WenboPan	e07347b188	Remove seq_len arg in rotary_emb (#1443 ) * remove seq_len in llama rotary_emb * chore: lint --------- Co-authored-by: Wing Lian <wing.lian@gmail.com>	2024-03-26 15:19:44 -04:00
Far El	bcdc9b1601	Fix falcon tokenization step (#1441 ) [skip ci] * Fix falcon tokenization step * chore: lint --------- Co-authored-by: Wing Lian <wing.lian@gmail.com>	2024-03-26 15:19:34 -04:00
Satpal Singh Rathore	c19d060a74	turn sample_packing on for training (#1438 ) [skip ci]	2024-03-26 15:19:04 -04:00
Wing Lian	601b77bc9d	make sure to capture non-null defaults from config validation (#1415 )	2024-03-26 15:18:47 -04:00