debug
This commit is contained in:
@@ -20,6 +20,7 @@ datasets==3.6.0
|
|||||||
deepspeed>=0.17.0
|
deepspeed>=0.17.0
|
||||||
trl==0.18.1
|
trl==0.18.1
|
||||||
hf_xet==1.1.2
|
hf_xet==1.1.2
|
||||||
|
mistral-common[hf-hub]==1.6.0
|
||||||
|
|
||||||
optimum==1.16.2
|
optimum==1.16.2
|
||||||
hf_transfer
|
hf_transfer
|
||||||
@@ -67,5 +68,3 @@ schedulefree==1.4.1
|
|||||||
|
|
||||||
axolotl-contribs-lgpl==0.0.6
|
axolotl-contribs-lgpl==0.0.6
|
||||||
axolotl-contribs-mit==0.0.3
|
axolotl-contribs-mit==0.0.3
|
||||||
|
|
||||||
mistral-common[hf-hub]==1.6.0
|
|
||||||
|
|||||||
@@ -64,6 +64,8 @@ class TokenizedPromptDataset(Dataset):
|
|||||||
desc="Strategy Filtering Rows",
|
desc="Strategy Filtering Rows",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
import ipdb; ipdb.set_trace()
|
||||||
|
|
||||||
return dataset.map(
|
return dataset.map(
|
||||||
self.prompt_tokenizer.tokenize_prompt,
|
self.prompt_tokenizer.tokenize_prompt,
|
||||||
num_proc=num_proc,
|
num_proc=num_proc,
|
||||||
|
|||||||
@@ -10,7 +10,6 @@ from huggingface_hub import hf_hub_download
|
|||||||
from mistral_common.protocol.instruct.messages import SystemMessage, UserMessage
|
from mistral_common.protocol.instruct.messages import SystemMessage, UserMessage
|
||||||
from mistral_common.protocol.instruct.request import ChatCompletionRequest
|
from mistral_common.protocol.instruct.request import ChatCompletionRequest
|
||||||
from mistral_common.tokens.tokenizers.mistral import (
|
from mistral_common.tokens.tokenizers.mistral import (
|
||||||
MODEL_NAME_TO_TOKENIZER_CLS,
|
|
||||||
MistralTokenizer,
|
MistralTokenizer,
|
||||||
)
|
)
|
||||||
from transformers import (
|
from transformers import (
|
||||||
@@ -366,15 +365,6 @@ class TokenizerConfiguration:
|
|||||||
self.cfg = cfg
|
self.cfg = cfg
|
||||||
self.model_config = load_model_config(cfg)
|
self.model_config = load_model_config(cfg)
|
||||||
|
|
||||||
def detect_by_model_name_mapping(self) -> bool:
|
|
||||||
# Extract model name from path
|
|
||||||
model = self.cfg.base_model.split("/")[-1]
|
|
||||||
for model_name in MODEL_NAME_TO_TOKENIZER_CLS.keys():
|
|
||||||
if model_name in model.lower():
|
|
||||||
return True
|
|
||||||
|
|
||||||
return False
|
|
||||||
|
|
||||||
def load_mistral_tokenizer(self) -> MistralTokenizerWrapper:
|
def load_mistral_tokenizer(self) -> MistralTokenizerWrapper:
|
||||||
"""Load Mistral tokenizer from model configuration."""
|
"""Load Mistral tokenizer from model configuration."""
|
||||||
# Instantiate Mistral tokenizer
|
# Instantiate Mistral tokenizer
|
||||||
|
|||||||
@@ -67,6 +67,8 @@ class PromptTokenizingStrategy(abc.ABC):
|
|||||||
LOG.warning("Empty text requested for tokenization.")
|
LOG.warning("Empty text requested for tokenization.")
|
||||||
return empty
|
return empty
|
||||||
|
|
||||||
|
import ipdb; ipdb.set_trace()
|
||||||
|
|
||||||
result = self.tokenizer(
|
result = self.tokenizer(
|
||||||
prompt,
|
prompt,
|
||||||
truncation=True,
|
truncation=True,
|
||||||
|
|||||||
@@ -486,6 +486,8 @@ def get_dataset_wrapper(
|
|||||||
f"Loading dataset: {config_dataset['path']} with base_type: {d_base_type} and prompt_style: {d_prompt_style}"
|
f"Loading dataset: {config_dataset['path']} with base_type: {d_base_type} and prompt_style: {d_prompt_style}"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
import ipdb; ipdb.set_trace()
|
||||||
|
|
||||||
if (
|
if (
|
||||||
isinstance(dataset, Dataset)
|
isinstance(dataset, Dataset)
|
||||||
and "input_ids" in dataset.features
|
and "input_ids" in dataset.features
|
||||||
|
|||||||
Reference in New Issue
Block a user