Compare commits

..

1 Commits

Author SHA1 Message Date
Wing Lian
1cfb8feb2d add iterable argument to preprocess-cli 2025-01-27 14:31:12 -05:00
13 changed files with 33 additions and 165 deletions

View File

@@ -19,47 +19,35 @@ website:
href: https://discord.gg/7m9sfhzaf3
sidebar:
pinned: true
collapse-level: 2
style: docked
contents:
- text: Home
href: index.qmd
- section: "How-To Guides"
contents:
- docs/debugging.qmd
- docs/multipack.qmd
- docs/fsdp_qlora.qmd
- docs/input_output.qmd
- docs/rlhf.qmd
- docs/nccl.qmd
- docs/mac.qmd
- docs/multi-node.qmd
- docs/unsloth.qmd
- docs/amd_hpc.qmd
- section: "Dataset Formats"
contents: docs/dataset-formats/*
- section: "Reference"
contents:
- docs/config.qmd
- section: "API Reference"
contents: "{{ api_contents }}"
- text: "FAQ"
href: docs/faq.qmd
pinned: true
collapse-level: 2
style: docked
contents:
- text: Home
href: index.qmd
- section: "How-To Guides"
contents:
# TODO Edit folder structure after we have more docs.
- docs/debugging.qmd
- docs/multipack.qmd
- docs/fsdp_qlora.qmd
- docs/input_output.qmd
- docs/rlhf.qmd
- docs/nccl.qmd
- docs/mac.qmd
- docs/multi-node.qmd
- docs/unsloth.qmd
- docs/amd_hpc.qmd
- section: "Dataset Formats"
contents: docs/dataset-formats/*
- section: "Reference"
contents:
- docs/config.qmd
- docs/faq.qmd
format:
html:
theme: materia
css: styles.css
toc: true
quartodoc:
package: axolotl
parser: google
dir: api
sections:
- title: Core API
desc: Core functionality of Axolotl
metadata-files:
- api/_sidebar.yml

View File

@@ -1,17 +0,0 @@
website:
sidebar:
- collapse-level: 2
contents:
- href: introduction.qmd
text: Introduction
- contents:
- reference/index.qmd
- contents: []
section: axolotl
section: Reference
- href: basics-summary.qmd
text: Basics
id: reference
search: true
style: docked
- id: dummy-sidebar

View File

@@ -1,11 +0,0 @@
# ConstantLengthDataset { #axolotl.ConstantLengthDataset }
```python
ConstantLengthDataset(self, tokenizer, datasets, seq_length=2048)
```
Iterable dataset that returns constant-length chunks of tokens from a stream of text files.
Args:
tokenizer (Tokenizer): The processor used for processing the data.
dataset (dataset.Dataset): Dataset with text files.
seq_length (int): Length of token sequences to return.

View File

@@ -1,19 +0,0 @@
# TokenizedPromptDataset { #axolotl.TokenizedPromptDataset }
```python
TokenizedPromptDataset(
self,
prompt_tokenizer,
dataset,
process_count=None,
keep_in_memory=False,
**kwargs,
)
```
Dataset that returns tokenized prompts from a stream of text files.
Args:
prompt_tokenizer (PromptTokenizingStrategy): The prompt tokenizing method for processing the data.
dataset (dataset.Dataset): Dataset with text files.
process_count (int): Number of processes to use for tokenizing.
keep_in_memory (bool): Whether to keep the tokenized dataset in memory.

View File

@@ -1,28 +0,0 @@
# choose_config { #axolotl.choose_config }
```python
choose_config(path)
```
Helper method for choosing an `axolotl` config YAML file (considering only files
ending with `.yml` or `.yaml`). If more than one config file exists in the passed
`path`, the user is prompted to choose one.
## Parameters {.doc-section .doc-section-parameters}
| Name | Type | Description | Default |
|--------|--------|-----------------------------------------------|------------|
| path | Path | Directory in which config file(s) are stored. | _required_ |
## Returns {.doc-section .doc-section-returns}
| Name | Type | Description |
|--------|--------|----------------------------------------------------------------------------------|
| | str | Path to either (1) the sole YAML file, or (2) if more than one YAML file exists, |
| | str | the user-selected YAML file. |
## Raises {.doc-section .doc-section-raises}
| Name | Type | Description |
|--------|------------|-------------------------------------------------|
| | ValueError | If no YAML files are found in the given `path`. |

View File

@@ -1,5 +0,0 @@
# Function reference {.doc .doc-index}
## Core API
Core functionality of Axolotl

View File

@@ -1,21 +0,0 @@
# load_cfg { #axolotl.load_cfg }
```python
load_cfg(config=Path('examples/'), **kwargs)
```
Loads the `axolotl` configuration stored at `config`, validates it, and performs
various setup.
## Parameters {.doc-section .doc-section-parameters}
| Name | Type | Description | Default |
|--------|--------------------|--------------------------------------------------------------|---------------------|
| config | Union\[str, Path\] | Path (local or remote) to `axolotl` config YAML file. | `Path('examples/')` |
| kwargs | | Additional keyword arguments to override config file values. | `{}` |
## Returns {.doc-section .doc-section-returns}
| Name | Type | Description |
|--------|-------------|-----------------------------------------------------|
| | DictDefault | `DictDefault` mapping configuration keys to values. |

View File

@@ -1,5 +0,0 @@
# validate_config { #axolotl.validate_config }
```python
validate_config(cfg, capabilities=None, env_capabilities=None)
```

View File

@@ -1 +0,0 @@
{"project": "axolotl", "version": "0.0.9999", "count": 0, "items": []}

View File

@@ -1,3 +0,0 @@
# API Reference {.doc .doc-index}
## Core API

View File

@@ -2,5 +2,3 @@ pre-commit
black
mypy
types-requests
quartodoc
quarto-cli

View File

@@ -2,20 +2,6 @@
import pkgutil
from .cli.config import choose_config, load_cfg, validate_config
from .datasets import ConstantLengthDataset, TokenizedPromptDataset
from .evaluate import evaluate
from .train import train
__path__ = pkgutil.extend_path(__path__, __name__) # Make this a namespace package
__version__ = "0.6.0"
__all__ = [
"train",
"evaluate",
"TokenizedPromptDataset",
"ConstantLengthDataset",
"load_cfg",
"choose_config",
"validate_config",
]
__version__ = "0.6.0"

View File

@@ -13,6 +13,12 @@ class PreprocessCliArgs:
debug_num_examples: int = field(default=1)
prompter: Optional[str] = field(default=None)
download: Optional[bool] = field(default=True)
iterable: Optional[bool] = field(
default=None,
metadata={
"help": "Use IterableDataset for streaming processing of large datasets"
},
)
@dataclass