Compare commits

..

1 Commits

Author SHA1 Message Date
Wing Lian
1cfb8feb2d add iterable argument to preprocess-cli 2025-01-27 14:31:12 -05:00
13 changed files with 33 additions and 165 deletions

View File

@@ -19,47 +19,35 @@ website:
href: https://discord.gg/7m9sfhzaf3
sidebar:
pinned: true
collapse-level: 2
style: docked
contents:
- text: Home
href: index.qmd
- section: "How-To Guides"
contents:
- docs/debugging.qmd
- docs/multipack.qmd
- docs/fsdp_qlora.qmd
- docs/input_output.qmd
- docs/rlhf.qmd
- docs/nccl.qmd
- docs/mac.qmd
- docs/multi-node.qmd
- docs/unsloth.qmd
- docs/amd_hpc.qmd
- section: "Dataset Formats"
contents: docs/dataset-formats/*
- section: "Reference"
contents:
- docs/config.qmd
- section: "API Reference"
contents: "{{ api_contents }}"
- text: "FAQ"
href: docs/faq.qmd
pinned: true
collapse-level: 2
style: docked
contents:
- text: Home
href: index.qmd
- section: "How-To Guides"
contents:
# TODO Edit folder structure after we have more docs.
- docs/debugging.qmd
- docs/multipack.qmd
- docs/fsdp_qlora.qmd
- docs/input_output.qmd
- docs/rlhf.qmd
- docs/nccl.qmd
- docs/mac.qmd
- docs/multi-node.qmd
- docs/unsloth.qmd
- docs/amd_hpc.qmd
- section: "Dataset Formats"
contents: docs/dataset-formats/*
- section: "Reference"
contents:
- docs/config.qmd
- docs/faq.qmd
format:
html:
theme: materia
css: styles.css
toc: true
quartodoc:
package: axolotl
parser: google
dir: api
sections:
- title: Core API
desc: Core functionality of Axolotl
metadata-files:
- api/_sidebar.yml

View File

@@ -1,17 +0,0 @@
website:
sidebar:
- collapse-level: 2
contents:
- href: introduction.qmd
text: Introduction
- contents:
- reference/index.qmd
- contents: []
section: axolotl
section: Reference
- href: basics-summary.qmd
text: Basics
id: reference
search: true
style: docked
- id: dummy-sidebar

View File

@@ -1,11 +0,0 @@
# ConstantLengthDataset { #axolotl.ConstantLengthDataset }
```python
ConstantLengthDataset(self, tokenizer, datasets, seq_length=2048)
```
Iterable dataset that returns constant-length chunks of tokens from a stream of text files.
Args:
tokenizer (Tokenizer): The processor used for processing the data.
dataset (dataset.Dataset): Dataset with text files.
seq_length (int): Length of token sequences to return.

View File

@@ -1,19 +0,0 @@
# TokenizedPromptDataset { #axolotl.TokenizedPromptDataset }
```python
TokenizedPromptDataset(
self,
prompt_tokenizer,
dataset,
process_count=None,
keep_in_memory=False,
**kwargs,
)
```
Dataset that returns tokenized prompts from a stream of text files.
Args:
prompt_tokenizer (PromptTokenizingStrategy): The prompt tokenizing method for processing the data.
dataset (dataset.Dataset): Dataset with text files.
process_count (int): Number of processes to use for tokenizing.
keep_in_memory (bool): Whether to keep the tokenized dataset in memory.

View File

@@ -1,28 +0,0 @@
# choose_config { #axolotl.choose_config }
```python
choose_config(path)
```
Helper method for choosing an `axolotl` config YAML file (considering only files
ending with `.yml` or `.yaml`). If more than one config file exists in the passed
`path`, the user is prompted to choose one.
## Parameters {.doc-section .doc-section-parameters}
| Name | Type | Description | Default |
|--------|--------|-----------------------------------------------|------------|
| path | Path | Directory in which config file(s) are stored. | _required_ |
## Returns {.doc-section .doc-section-returns}
| Name | Type | Description |
|--------|--------|----------------------------------------------------------------------------------|
| | str | Path to either (1) the sole YAML file, or (2) if more than one YAML file exists, |
| | str | the user-selected YAML file. |
## Raises {.doc-section .doc-section-raises}
| Name | Type | Description |
|--------|------------|-------------------------------------------------|
| | ValueError | If no YAML files are found in the given `path`. |

View File

@@ -1,5 +0,0 @@
# Function reference {.doc .doc-index}
## Core API
Core functionality of Axolotl

View File

@@ -1,21 +0,0 @@
# load_cfg { #axolotl.load_cfg }
```python
load_cfg(config=Path('examples/'), **kwargs)
```
Loads the `axolotl` configuration stored at `config`, validates it, and performs
various setup.
## Parameters {.doc-section .doc-section-parameters}
| Name | Type | Description | Default |
|--------|--------------------|--------------------------------------------------------------|---------------------|
| config | Union\[str, Path\] | Path (local or remote) to `axolotl` config YAML file. | `Path('examples/')` |
| kwargs | | Additional keyword arguments to override config file values. | `{}` |
## Returns {.doc-section .doc-section-returns}
| Name | Type | Description |
|--------|-------------|-----------------------------------------------------|
| | DictDefault | `DictDefault` mapping configuration keys to values. |

View File

@@ -1,5 +0,0 @@
# validate_config { #axolotl.validate_config }
```python
validate_config(cfg, capabilities=None, env_capabilities=None)
```

View File

@@ -1 +0,0 @@
{"project": "axolotl", "version": "0.0.9999", "count": 0, "items": []}

View File

@@ -1,3 +0,0 @@
# API Reference {.doc .doc-index}
## Core API

View File

@@ -2,5 +2,3 @@ pre-commit
black
mypy
types-requests
quartodoc
quarto-cli

View File

@@ -2,20 +2,6 @@
import pkgutil
from .cli.config import choose_config, load_cfg, validate_config
from .datasets import ConstantLengthDataset, TokenizedPromptDataset
from .evaluate import evaluate
from .train import train
__path__ = pkgutil.extend_path(__path__, __name__) # Make this a namespace package
__version__ = "0.6.0"
__all__ = [
"train",
"evaluate",
"TokenizedPromptDataset",
"ConstantLengthDataset",
"load_cfg",
"choose_config",
"validate_config",
]
__version__ = "0.6.0"

View File

@@ -13,6 +13,12 @@ class PreprocessCliArgs:
debug_num_examples: int = field(default=1)
prompter: Optional[str] = field(default=None)
download: Optional[bool] = field(default=True)
iterable: Optional[bool] = field(
default=None,
metadata={
"help": "Use IterableDataset for streaming processing of large datasets"
},
)
@dataclass