This commit is contained in:
Dan Saunders
2025-06-09 20:38:13 +00:00
parent 70e9cb545d
commit 8f75136ad3
5 changed files with 7 additions and 12 deletions

View File

@@ -20,6 +20,7 @@ datasets==3.6.0
deepspeed>=0.17.0 deepspeed>=0.17.0
trl==0.18.1 trl==0.18.1
hf_xet==1.1.2 hf_xet==1.1.2
mistral-common[hf-hub]==1.6.0
optimum==1.16.2 optimum==1.16.2
hf_transfer hf_transfer
@@ -67,5 +68,3 @@ schedulefree==1.4.1
axolotl-contribs-lgpl==0.0.6 axolotl-contribs-lgpl==0.0.6
axolotl-contribs-mit==0.0.3 axolotl-contribs-mit==0.0.3
mistral-common[hf-hub]==1.6.0

View File

@@ -64,6 +64,8 @@ class TokenizedPromptDataset(Dataset):
desc="Strategy Filtering Rows", desc="Strategy Filtering Rows",
) )
import ipdb; ipdb.set_trace()
return dataset.map( return dataset.map(
self.prompt_tokenizer.tokenize_prompt, self.prompt_tokenizer.tokenize_prompt,
num_proc=num_proc, num_proc=num_proc,

View File

@@ -10,7 +10,6 @@ from huggingface_hub import hf_hub_download
from mistral_common.protocol.instruct.messages import SystemMessage, UserMessage from mistral_common.protocol.instruct.messages import SystemMessage, UserMessage
from mistral_common.protocol.instruct.request import ChatCompletionRequest from mistral_common.protocol.instruct.request import ChatCompletionRequest
from mistral_common.tokens.tokenizers.mistral import ( from mistral_common.tokens.tokenizers.mistral import (
MODEL_NAME_TO_TOKENIZER_CLS,
MistralTokenizer, MistralTokenizer,
) )
from transformers import ( from transformers import (
@@ -366,15 +365,6 @@ class TokenizerConfiguration:
self.cfg = cfg self.cfg = cfg
self.model_config = load_model_config(cfg) self.model_config = load_model_config(cfg)
def detect_by_model_name_mapping(self) -> bool:
    """Report whether the base model's name contains a known tokenizer key.

    Only the text after the last "/" in ``self.cfg.base_model`` is
    considered. That text is lowercased and checked for any key of
    ``MODEL_NAME_TO_TOKENIZER_CLS`` occurring as a substring.

    Returns:
        True if at least one mapping key is a substring of the lowercased
        model name, False otherwise.
    """
    # Keep only the final path component (everything after the last "/").
    leaf_name = self.cfg.base_model.rpartition("/")[2]
    lowered = leaf_name.lower()
    return any(
        known_name in lowered
        for known_name in MODEL_NAME_TO_TOKENIZER_CLS.keys()
    )
def load_mistral_tokenizer(self) -> MistralTokenizerWrapper: def load_mistral_tokenizer(self) -> MistralTokenizerWrapper:
"""Load Mistral tokenizer from model configuration.""" """Load Mistral tokenizer from model configuration."""
# Instantiate Mistral tokenizer # Instantiate Mistral tokenizer

View File

@@ -67,6 +67,8 @@ class PromptTokenizingStrategy(abc.ABC):
LOG.warning("Empty text requested for tokenization.") LOG.warning("Empty text requested for tokenization.")
return empty return empty
import ipdb; ipdb.set_trace()
result = self.tokenizer( result = self.tokenizer(
prompt, prompt,
truncation=True, truncation=True,

View File

@@ -486,6 +486,8 @@ def get_dataset_wrapper(
f"Loading dataset: {config_dataset['path']} with base_type: {d_base_type} and prompt_style: {d_prompt_style}" f"Loading dataset: {config_dataset['path']} with base_type: {d_base_type} and prompt_style: {d_prompt_style}"
) )
import ipdb; ipdb.set_trace()
if ( if (
isinstance(dataset, Dataset) isinstance(dataset, Dataset)
and "input_ids" in dataset.features and "input_ids" in dataset.features