From 4d1553e53f31af952928fd1ec68f17c6a7252276 Mon Sep 17 00:00:00 2001 From: Dan Saunders Date: Mon, 27 Jan 2025 15:43:51 -0500 Subject: [PATCH] updates --- _quarto.yml | 72 ++++++++++++++++------------------ _sidebar.yml | 17 ++++++++ api/ConstantLengthDataset.qmd | 11 ++++++ api/TokenizedPromptDataset.qmd | 19 +++++++++ api/choose_config.qmd | 28 +++++++++++++ api/index.qmd | 5 +++ api/load_cfg.qmd | 21 ++++++++++ api/validate_config.qmd | 5 +++ objects.json | 1 + reference/index.qmd | 3 ++ src/axolotl/__init__.py | 16 +++++++- 11 files changed, 159 insertions(+), 39 deletions(-) create mode 100644 _sidebar.yml create mode 100644 api/ConstantLengthDataset.qmd create mode 100644 api/TokenizedPromptDataset.qmd create mode 100644 api/choose_config.qmd create mode 100644 api/index.qmd create mode 100644 api/load_cfg.qmd create mode 100644 api/validate_config.qmd create mode 100644 objects.json create mode 100644 reference/index.qmd diff --git a/_quarto.yml b/_quarto.yml index a530bf73d..2f30abbdd 100644 --- a/_quarto.yml +++ b/_quarto.yml @@ -1,9 +1,6 @@ project: type: website -metadata-files: - - _sidebar.yml - website: title: "Axolotl" description: "Fine-tuning" @@ -22,31 +19,33 @@ website: href: https://discord.gg/7m9sfhzaf3 sidebar: - pinned: true - collapse-level: 2 - style: docked - contents: - - text: Home - href: index.qmd - - section: "How-To Guides" - contents: - # TODO Edit folder structure after we have more docs. - - docs/debugging.qmd - - docs/multipack.qmd - - docs/fsdp_qlora.qmd - - docs/input_output.qmd - - docs/rlhf.qmd - - docs/nccl.qmd - - docs/mac.qmd - - docs/multi-node.qmd - - docs/unsloth.qmd - - docs/amd_hpc.qmd - - section: "Dataset Formats" - contents: docs/dataset-formats/* - - section: "Reference" - contents: - - docs/config.qmd - - docs/faq.qmd + pinned: true + collapse-level: 2 + style: docked + contents: + - text: Home + href: index.qmd + - section: "How-To Guides" + contents: + - docs/debugging.qmd + - docs/multipack.qmd + - docs/fsdp_qlora.qmd + - docs/input_output.qmd + - docs/rlhf.qmd + - docs/nccl.qmd + - docs/mac.qmd + - docs/multi-node.qmd + - docs/unsloth.qmd + - docs/amd_hpc.qmd + - section: "Dataset Formats" + contents: docs/dataset-formats/* + - section: "Reference" + contents: + - docs/config.qmd + - section: "API Reference" + contents: "{{ api_contents }}" + - text: "FAQ" + href: docs/faq.qmd format: html: @@ -56,14 +55,11 @@ format: quartodoc: package: axolotl - - sidebar: _sidebar.yml - + parser: google + dir: api sections: - - title: Some functions - desc: Functions to inspect docstrings. - contents: - # the functions being documented in the package. - # you can refer to anything: class methods, modules, etc.. - - get_object - - preview + - title: Core API + desc: Core functionality of Axolotl + +metadata-files: + - api/_sidebar.yml diff --git a/_sidebar.yml b/_sidebar.yml new file mode 100644 index 000000000..1a01a4e08 --- /dev/null +++ b/_sidebar.yml @@ -0,0 +1,17 @@ +website: + sidebar: + - collapse-level: 2 + contents: + - href: introduction.qmd + text: Introduction + - contents: + - reference/index.qmd + - contents: [] + section: axolotl + section: Reference + - href: basics-summary.qmd + text: Basics + id: reference + search: true + style: docked + - id: dummy-sidebar diff --git a/api/ConstantLengthDataset.qmd b/api/ConstantLengthDataset.qmd new file mode 100644 index 000000000..0998d2f14 --- /dev/null +++ b/api/ConstantLengthDataset.qmd @@ -0,0 +1,11 @@ +# ConstantLengthDataset { #axolotl.ConstantLengthDataset } + +```python +ConstantLengthDataset(self, tokenizer, datasets, seq_length=2048) +``` + +Iterable dataset that returns constant length chunks of tokens from stream of text files. + Args: + tokenizer (Tokenizer): The processor used for processing the data. + dataset (dataset.Dataset): Dataset with text files. + seq_length (int): Length of token sequences to return. diff --git a/api/TokenizedPromptDataset.qmd b/api/TokenizedPromptDataset.qmd new file mode 100644 index 000000000..a941375de --- /dev/null +++ b/api/TokenizedPromptDataset.qmd @@ -0,0 +1,19 @@ +# TokenizedPromptDataset { #axolotl.TokenizedPromptDataset } + +```python +TokenizedPromptDataset( + self, + prompt_tokenizer, + dataset, + process_count=None, + keep_in_memory=False, + **kwargs, +) +``` + +Dataset that returns tokenized prompts from a stream of text files. + Args: + prompt_tokenizer (PromptTokenizingStrategy): The prompt tokenizing method for processing the data. + dataset (dataset.Dataset): Dataset with text files. + process_count (int): Number of processes to use for tokenizing. + keep_in_memory (bool): Whether to keep the tokenized dataset in memory. diff --git a/api/choose_config.qmd b/api/choose_config.qmd new file mode 100644 index 000000000..9ec9fcfa3 --- /dev/null +++ b/api/choose_config.qmd @@ -0,0 +1,28 @@ +# choose_config { #axolotl.choose_config } + +```python +choose_config(path) +``` + +Helper method for choosing a `axolotl` config YAML file (considering only files +ending with `.yml` or `.yaml`). If more than one config file exists in the passed +`path`, the user is prompted to choose one. + +## Parameters {.doc-section .doc-section-parameters} + +| Name | Type | Description | Default | +|--------|--------|-----------------------------------------------|------------| +| path | Path | Directory in which config file(s) are stored. | _required_ | + +## Returns {.doc-section .doc-section-returns} + +| Name | Type | Description | +|--------|--------|----------------------------------------------------------------------------------| +| | str | Path to either (1) the sole YAML file, or (2) if more than one YAML files exist, | +| | str | the user-selected YAML file. | + +## Raises {.doc-section .doc-section-raises} + +| Name | Type | Description | +|--------|------------|-------------------------------------------------| +| | ValueError | If no YAML files are found in the given `path`. | diff --git a/api/index.qmd b/api/index.qmd new file mode 100644 index 000000000..e0847c27c --- /dev/null +++ b/api/index.qmd @@ -0,0 +1,5 @@ +# Function reference {.doc .doc-index} + +## Core API + +Core functionality of Axolotl diff --git a/api/load_cfg.qmd b/api/load_cfg.qmd new file mode 100644 index 000000000..280131635 --- /dev/null +++ b/api/load_cfg.qmd @@ -0,0 +1,21 @@ +# load_cfg { #axolotl.load_cfg } + +```python +load_cfg(config=Path('examples/'), **kwargs) +``` + +Loads the `axolotl` configuration stored at `config`, validates it, and performs +various setup. + +## Parameters {.doc-section .doc-section-parameters} + +| Name | Type | Description | Default | +|--------|--------------------|--------------------------------------------------------------|---------------------| +| config | Union\[str, Path\] | Path (local or remote) to `axolotl` config YAML file. | `Path('examples/')` | +| kwargs | | Additional keyword arguments to override config file values. | `{}` | + +## Returns {.doc-section .doc-section-returns} + +| Name | Type | Description | +|--------|-------------|-----------------------------------------------------| +| | DictDefault | `DictDefault` mapping configuration keys to values. | diff --git a/api/validate_config.qmd b/api/validate_config.qmd new file mode 100644 index 000000000..4bfbd8406 --- /dev/null +++ b/api/validate_config.qmd @@ -0,0 +1,5 @@ +# validate_config { #axolotl.validate_config } + +```python +validate_config(cfg, capabilities=None, env_capabilities=None) +``` diff --git a/objects.json b/objects.json new file mode 100644 index 000000000..b2044c990 --- /dev/null +++ b/objects.json @@ -0,0 +1 @@ +{"project": "axolotl", "version": "0.0.9999", "count": 0, "items": []} diff --git a/reference/index.qmd b/reference/index.qmd new file mode 100644 index 000000000..6fe79d073 --- /dev/null +++ b/reference/index.qmd @@ -0,0 +1,3 @@ +# API Reference {.doc .doc-index} + +## Core API diff --git a/src/axolotl/__init__.py b/src/axolotl/__init__.py index 8b0ba0532..e42ce153e 100644 --- a/src/axolotl/__init__.py +++ b/src/axolotl/__init__.py @@ -2,6 +2,20 @@ import pkgutil -__path__ = pkgutil.extend_path(__path__, __name__) # Make this a namespace package +from .cli.config import choose_config, load_cfg, validate_config +from .datasets import ConstantLengthDataset, TokenizedPromptDataset +from .evaluate import evaluate +from .train import train +__path__ = pkgutil.extend_path(__path__, __name__) # Make this a namespace package __version__ = "0.6.0" + +__all__ = [ + "train", + "evaluate", + "TokenizedPromptDataset", + "ConstantLengthDataset", + "load_cfg", + "choose_config", + "validate_config", +]