Compare commits

..

3 Commits

Author SHA1 Message Date
Wing Lian
a9ebff087c remove ref_model when peft model is passed into grpo trainer 2025-02-20 21:53:20 -05:00
NanoCode012
b53a41372f feat: update transformers version to 4.49.0 (#2340) 2025-02-20 21:12:06 -05:00
Wing Lian
02f45e94be calculate sample length fixes and SFT splitting fixes (#2351)
* fix chat template splitting long samples across multiple rows

* make the preprocessing faster
2025-02-20 14:29:58 -05:00
7 changed files with 35 additions and 20 deletions

View File

@@ -13,7 +13,7 @@ liger-kernel==0.5.2
packaging==23.2
peft==0.14.0
transformers==4.48.3
transformers==4.49.0
tokenizers>=0.21.0
accelerate==1.3.0
datasets==3.2.0

View File

@@ -39,6 +39,15 @@ class AxolotlGRPOTrainer(SchedulerMixin, GRPOTrainer):
self.model = self._enable_gradient_checkpointing(self.model, kwargs["args"])
# pylint: enable=access-member-before-definition
# cleanup the ref_model if we have a peft model passed in
# TODO remove this after next major trl release
if (
self.ref_model # pylint: disable=access-member-before-definition
and is_peft_model(self.model)
):
del self.ref_model
self.ref_model = None
def _enable_gradient_checkpointing(
self, model: PreTrainedModel, args: GRPOConfig
) -> PreTrainedModel:

View File

@@ -127,6 +127,8 @@ class ReLoRACallback(TrainerCallback):
optimizer: torch.optim.Optimizer,
**_kwargs,
):
if not optimizer:
optimizer = state.optimizer
if state.global_step > 0 and state.global_step % self.relora_steps == 0:
checkpoint_folder = os.path.join(
args.output_dir,

View File

@@ -272,8 +272,7 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):
dict(zip(feature_names, row))
)
for key, val in tokenized_prompt.items():
for i in range(0, len(val), self.sequence_len):
res[key].append(val[i : i + self.sequence_len])
res[key].append(val)
# If there are no examples left, return an empty dictionary
if not res:

View File

@@ -172,10 +172,11 @@ def drop_long_seq_in_dataset(dataset: Dataset, cfg: DictDefault):
)
try:
min_input_len = np.min(get_dataset_lengths(dataset))
LOG.debug(f"min_input_len: {min_input_len}")
max_input_len = np.max(get_dataset_lengths(dataset))
LOG.debug(f"max_input_len: {max_input_len}")
ds_lengths = get_dataset_lengths(dataset, from_arrow=True)
min_input_len = np.min(ds_lengths)
LOG.info(f"min_input_len: {min_input_len}")
max_input_len = np.max(ds_lengths)
LOG.info(f"max_input_len: {max_input_len}")
except AttributeError:
pass

View File

@@ -4,13 +4,17 @@ helper util to calculate dataset lengths
import numpy as np
def get_dataset_lengths(dataset):
if "length" in dataset.data.column_names:
lengths = np.array(dataset.data.column("length"))
elif "position_ids" in dataset.data.column_names:
position_ids = dataset.data.column("position_ids")
def get_dataset_lengths(dataset, from_arrow=False):
if "length" in dataset.column_names:
lengths = np.array(dataset["length"])
elif "position_ids" in dataset.column_names:
position_ids = dataset["position_ids"]
lengths = np.array([x[-1] + 1 for x in position_ids])
else:
input_ids = dataset.data.column("input_ids")
lengths = np.vectorize(len)(np.array(input_ids, dtype=object))
if from_arrow:
input_ids = dataset.data.column("input_ids")
lengths = np.vectorize(len)(np.array(input_ids, dtype=object))
else:
input_ids = dataset["input_ids"]
lengths = np.array([len(seq) for seq in input_ids])
return lengths

View File

@@ -7,6 +7,7 @@ from transformers import AutoTokenizer
from axolotl.datasets import TokenizedPromptDataset
from axolotl.prompt_strategies.completion import load
from axolotl.utils.collators import V2BatchSamplerDataCollatorForSeq2Seq
from axolotl.utils.data.utils import drop_long_seq_in_dataset
from axolotl.utils.dict import DictDefault
from axolotl.utils.samplers import MultipackBatchSampler, get_dataset_lengths
@@ -18,11 +19,6 @@ def fixture_tokenizer():
return tokenizer
@pytest.fixture(name="max_seq_length")
def fixture_max_seq_length():
return 4096
class TestBatchedSamplerPacking:
"""
Test class for packing streaming dataset sequences
@@ -37,6 +33,7 @@ class TestBatchedSamplerPacking:
(2, 2),
],
)
@pytest.mark.parametrize("max_seq_length", [4096, 512])
def test_packing(self, batch_size, num_workers, tokenizer, max_seq_length):
import axolotl.monkeypatch.data.batch_dataset_fetcher # pylint: disable=unused-import # noqa: F401
@@ -62,6 +59,9 @@ class TestBatchedSamplerPacking:
dataset,
)
train_dataset = concatenate_datasets([dataset_wrapper])
train_dataset = drop_long_seq_in_dataset(train_dataset, cfg)
lengths = get_dataset_lengths(train_dataset)
batch_sampler = MultipackBatchSampler(
sampler=RandomSampler(train_dataset),
@@ -90,7 +90,7 @@ class TestBatchedSamplerPacking:
batch_idxs.extend(pack)
for batch in loader:
assert len(batch["input_ids"]) <= batch_size * max_seq_length
assert batch["input_ids"].numel() <= batch_size * max_seq_length
assert batch["input_ids"].shape[1] == max_seq_length
original_idxs = set(range(len(train_dataset)))