From 7acf93b59f380408f28bf824d5798f42d4f3839b Mon Sep 17 00:00:00 2001
From: NanoCode012
Date: Tue, 1 Apr 2025 02:47:28 +0700
Subject: [PATCH] Fix(doc): Clarify doc on attention configs and missing pad_token (#2455) [skip ci]

* fix: clarify input type

* fix: handling of error message if data_files not available

* fix: clarify attention handling

* fix: add doc on missing pad token
---
 docs/config.qmd                  | 27 ++++++++++++++++-----------
 docs/faq.qmd                     | 12 +++++++++++-
 src/axolotl/utils/data/shared.py |  3 ++-
 3 files changed, 29 insertions(+), 13 deletions(-)

diff --git a/docs/config.qmd b/docs/config.qmd
index b0c8616a2..208d1b739 100644
--- a/docs/config.qmd
+++ b/docs/config.qmd
@@ -587,26 +587,31 @@ max_grad_norm:
 # currently only supported on Llama and Mistral
 neftune_noise_alpha:
 
-# Whether to bettertransformers
+# Optional[bool]. Whether to use bettertransformers
 flash_optimum:
-# Whether to use xformers attention patch https://github.com/facebookresearch/xformers:
+
+# Note: Only one of the following attention patches can be used at a time.
+# For example, if you set `xformers_attention` to `true`, do not set `flash_attention` to `true`.
+
+# Optional[bool]. Whether to use xformers attention patch https://github.com/facebookresearch/xformers:
 xformers_attention:
-# Whether to use flash attention patch https://github.com/Dao-AILab/flash-attention:
+# Optional[bool]. Whether to use flash attention patch https://github.com/Dao-AILab/flash-attention:
 flash_attention:
-flash_attn_cross_entropy: # Whether to use flash-attention cross entropy implementation - advanced use only
-flash_attn_rms_norm: # Whether to use flash-attention rms norm implementation - advanced use only
-flash_attn_fuse_qkv: # Whether to fuse QKV into a single operation
-flash_attn_fuse_mlp: # Whether to fuse part of the MLP into a single operation
-# Whether to use scaled-dot-product attention
+flash_attn_cross_entropy: # Optional[bool]. Whether to use flash-attention cross entropy implementation - advanced use only
+flash_attn_rms_norm: # Optional[bool]. Whether to use flash-attention rms norm implementation - advanced use only
+flash_attn_fuse_qkv: # Optional[bool]. Whether to fuse QKV into a single operation
+flash_attn_fuse_mlp: # Optional[bool]. Whether to fuse part of the MLP into a single operation
+# Optional[bool]. Whether to use scaled-dot-product attention
 # https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
 sdp_attention:
-# Shifted-sparse attention (only llama) - https://arxiv.org/pdf/2309.12307.pdf
+# Optional[bool]. Shifted-sparse attention (only llama) - https://arxiv.org/pdf/2309.12307.pdf
 s2_attention:
+
 # Optional[bool]. Whether to use low_cpu_mem_usage
 low_cpu_mem_usage:
-# Resume from a specific checkpoint dir
+# Optional[str]. Resume from a specific checkpoint dir
 resume_from_checkpoint:
-# If resume_from_checkpoint isn't set and you simply want it to start where it left off.
+# Optional[bool]. If resume_from_checkpoint isn't set and you simply want it to start where it left off.
 # Be careful with this being turned on between different models.
 auto_resume_from_checkpoints: false
 
diff --git a/docs/faq.qmd b/docs/faq.qmd
index 1ce14681a..664359cb8 100644
--- a/docs/faq.qmd
+++ b/docs/faq.qmd
@@ -35,12 +35,22 @@ description: Frequently asked questions
 
 **Q: How to call Axolotl via custom python scripts?**
 
-> A: Yes, since Axolotl is just Python, please see `src/axolotl/cli/main.py` on how each command is called.
+> A: Since Axolotl is just Python, please see `src/axolotl/cli/main.py` on how each command is called.
 
 **Q: How to know the value to use for `fsdp_transformer_layer_cls_to_wrap`?**
 
 > A: This is the class name of the transformer layer to wrap with FSDP. For example, for `LlamaForCausalLM`, the value is `LlamaDecoderLayer`. To find this for a specific model, check the model's `PreTrainedModel` definition and look for `_no_split_modules` variable in the `modeling_.py` file within `transformers` library.
 
+**Q: ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as pad_token**
+
+> A: This is because the tokenizer does not have a padding token. Please add a padding token to the tokenizer via:
+
+> ```yaml
+> special_tokens:
+>   # str. If you're not sure, set to same as `eos_token`.
+>   pad_token: "..."
+> ```
+
 ### Chat templates
 
 **Q: `jinja2.exceptions.UndefinedError: 'dict object' has no attribute 'content' / 'role' / ____`**
diff --git a/src/axolotl/utils/data/shared.py b/src/axolotl/utils/data/shared.py
index 8b3a7541a..1bb83efd5 100644
--- a/src/axolotl/utils/data/shared.py
+++ b/src/axolotl/utils/data/shared.py
@@ -238,7 +238,8 @@ def load_dataset_w_config(
             trust_remote_code=config_dataset.trust_remote_code,
             **load_ds_kwargs,
         )
-    else:
+    elif config_dataset.data_files:
+        fp: str | list[str] | None = None
         if isinstance(config_dataset.data_files, str):
             fp = hf_hub_download(
                 repo_id=config_dataset.path,
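
For reference, a minimal sketch of an Axolotl YAML config that follows the guidance documented above: exactly one attention patch enabled, and a `pad_token` supplied when the tokenizer ships without one. The `base_model` and token values below are illustrative placeholders, not part of this patch.

```yaml
# Illustrative sketch only; swap in your own model and token values.
base_model: NousResearch/Llama-2-7b-hf  # placeholder model

# Enable exactly one attention patch; do not combine flash_attention with
# xformers_attention or sdp_attention.
flash_attention: true

# If the tokenizer has no pad token, define one explicitly; reusing the
# eos token string is a common choice.
special_tokens:
  pad_token: "</s>"
```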