Merge branch 'main' of https://github.com/OpenAccess-AI-Collective/axolotl into qlora-openllama-3b-example

2023-05-29 09:09:43 -05:00
parent 370d057096 00323f0a6f
commit f1fbf666f7
4 changed files with 17 additions and 11 deletions
--- a/.github/workflows/base.yml
+++ b/.github/workflows/base.yml
@@ -1,4 +1,4 @@
-name: ci-cd
+name: ci-cd-base
 on:
  push:
--- a/docker/Dockerfile-base
+++ b/docker/Dockerfile-base
@@ -62,6 +62,7 @@ RUN git clone https://github.com/microsoft/DeepSpeed.git && \
 FROM base-builder AS bnb-builder
 WORKDIR /workspace
 ENV CUDA_VERSION_BNB=$CUDA_VERSION_BNB
 RUN git clone https://github.com/TimDettmers/bitsandbytes.git && \
    cd bitsandbytes && \
@@ -70,6 +71,8 @@ RUN git clone https://github.com/TimDettmers/bitsandbytes.git && \
 FROM base-builder
 ENV CUDA_VERSION_BNB=$CUDA_VERSION_BNB
 # recompile apex
 RUN python3 -m pip uninstall -y apex
 RUN git clone https://github.com/NVIDIA/apex
--- a/scripts/finetune.py
+++ b/scripts/finetune.py
@@ -178,6 +178,15 @@ def train(
            tokenizer, cfg, DEFAULT_DATASET_PREPARED_PATH
        )
    if cfg.debug or "debug" in kwargs:
        logging.info("check_dataset_labels...")
        check_dataset_labels(
            train_dataset.select(
                [random.randrange(0, len(train_dataset) - 1) for i in range(5)]
            ),
            tokenizer,
        )
    if prepare_ds_only:
        logging.info("Finished preparing dataset. Exiting...")
        return
@@ -213,15 +222,6 @@ def train(
        model.save_pretrained(cfg.output_dir)
        return
    if cfg.debug:
        logging.info("check_dataset_labels...")
        check_dataset_labels(
            train_dataset.select(
                [random.randrange(0, len(train_dataset) - 1) for i in range(5)]
            ),
            tokenizer,
        )
    trainer = setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer)
    model.config.use_cache = False
--- a/src/axolotl/prompt_tokenizers.py
+++ b/src/axolotl/prompt_tokenizers.py
@@ -268,6 +268,9 @@ class AlpacaReflectionPTStrategy(ReflectionPromptTokenizingStrategy):
 class ShareGPTPromptTokenizingStrategy(PromptTokenizingStrategy):
    def get_conversation_thread(self, prompt):
        return prompt["conversations"]
    def tokenize_prompt(self, prompt):
        result = {
            "input_ids": [],
@@ -279,7 +282,7 @@ class ShareGPTPromptTokenizingStrategy(PromptTokenizingStrategy):
        assistant_token = self._get_assistant_token()
        try:
            for i, part in enumerate(
-                self.prompter.build_prompt(prompt["conversations"])
+                self.prompter.build_prompt(self.get_conversation_thread(prompt))
            ):
                if isinstance(part, tuple):
                    if part[0] == "USER:":