Release update 20250331 (#2460) [skip ci]

* make torch 2.6.0 the default image

* fix tests against upstream main

* fix attribute access

* use fixture dataset

* fix dataset load

* correct the fixtures + tests

* more fixtures

* add accidentally removed shakespeare fixture

* fix conversion from unittest to pytest class

* nightly main ci caches

* build 12.6.3 cuda base image

* override for fix from huggingface/transformers#37162

* address PR feedback
This commit is contained in:
Wing Lian
2025-04-01 08:47:50 -04:00
committed by GitHub
parent 328d598114
commit e0aba74dd0
17 changed files with 347 additions and 169 deletions

View File

@@ -238,21 +238,22 @@ class TestDeduplicateRLDataset:
@enable_hf_offline
def test_load_with_deduplication(
self, cfg, dataset_fozzie_alpaca_dpo_dataset_rev_ea82cff, tokenizer_huggyllama
self,
cfg,
dataset_fozziethebeat_alpaca_messages_2k_dpo_test_rev_ea82cff,
tokenizer_huggyllama,
):
"""Verify that loading with deduplication removes duplicates."""
# pylint: disable=duplicate-code
with (
patch(
"axolotl.utils.data.shared.load_dataset_w_config"
) as mock_load_dataset,
patch("axolotl.utils.data.rl.load_dataset_w_config") as mock_load_dataset,
patch("axolotl.utils.models.load_tokenizer") as mock_load_tokenizer,
):
# side_effect yields one list entry per call; both calls return the same fixture dataset
mock_load_dataset.side_effect = [
dataset_fozzie_alpaca_dpo_dataset_rev_ea82cff,
dataset_fozzie_alpaca_dpo_dataset_rev_ea82cff,
dataset_fozziethebeat_alpaca_messages_2k_dpo_test_rev_ea82cff,
dataset_fozziethebeat_alpaca_messages_2k_dpo_test_rev_ea82cff,
]
mock_load_tokenizer.return_value = tokenizer_huggyllama
@@ -263,19 +264,20 @@ class TestDeduplicateRLDataset:
@enable_hf_offline
def test_load_without_deduplication(
self, cfg, dataset_fozzie_alpaca_dpo_dataset_rev_ea82cff, tokenizer_huggyllama
self,
cfg,
dataset_fozziethebeat_alpaca_messages_2k_dpo_test_rev_ea82cff,
tokenizer_huggyllama,
):
# pylint: disable=duplicate-code
with (
patch(
"axolotl.utils.data.shared.load_dataset_w_config"
) as mock_load_dataset,
patch("axolotl.utils.data.rl.load_dataset_w_config") as mock_load_dataset,
patch("axolotl.utils.models.load_tokenizer") as mock_load_tokenizer,
):
# side_effect yields one list entry per call; both calls return the same fixture dataset
mock_load_dataset.side_effect = [
dataset_fozzie_alpaca_dpo_dataset_rev_ea82cff,
dataset_fozzie_alpaca_dpo_dataset_rev_ea82cff,
dataset_fozziethebeat_alpaca_messages_2k_dpo_test_rev_ea82cff,
dataset_fozziethebeat_alpaca_messages_2k_dpo_test_rev_ea82cff,
]
mock_load_tokenizer.return_value = tokenizer_huggyllama