fix: handle sharegpt dataset missing (#2035)

* fix: handle sharegpt dataset missing

* fix: explanation

* feat: add test
This commit is contained in:
NanoCode012
2024-11-12 12:51:37 +07:00
committed by GitHub
parent 3931a42763
commit 9f1cf9b17c
2 changed files with 62 additions and 1 deletions

View File

@@ -234,3 +234,59 @@ class TestValidationCheckDatasetConfig(BaseValidation):
)
_check_config()
def test_dataset_sharegpt_deprecation(self, minimal_cfg):
    """Exercise config validation against three dataset ``type`` shapes.

    Scenarios, run in order against configs derived from ``minimal_cfg``:

    1. ``type: "sharegpt"`` (string) must make ``validate_config`` raise a
       ``ValueError`` whose message matches the deprecation pattern.
    2. A dict-valued ``type`` must pass validation without raising.
    3. A string ``type`` other than ``sharegpt`` (``"alpaca"``) must pass
       validation without raising.
    """
    # Scenario 1: the string type "sharegpt" is expected to be rejected.
    sharegpt_entry = {
        "path": "LDJnr/Puffin",
        "type": "sharegpt",
        "conversation": "chatml",
    }
    sharegpt_overrides = {
        "chat_template": "chatml",
        "datasets": [sharegpt_entry],
    }
    deprecated_cfg = DictDefault(minimal_cfg | sharegpt_overrides)
    with pytest.raises(ValueError, match=r".*type: sharegpt.*` is deprecated.*"):
        validate_config(deprecated_cfg)

    # Scenario 2: a mapping (non-str) type must not trip the deprecation check.
    custom_prompt_type = {
        "field_instruction": "instruction",
        "field_output": "output",
        "field_system": "system",
        "format": "<|user|> {instruction} {input} <|model|>",
        "no_input_format": "<|user|> {instruction} <|model|>",
        "system_prompt": "",
    }
    custom_entry = {
        "path": "mhenrichsen/alpaca_2k_test",
        "type": custom_prompt_type,
    }
    dict_type_cfg = DictDefault(minimal_cfg | {"datasets": [custom_entry]})
    validate_config(dict_type_cfg)

    # Scenario 3: a string type that is not "sharegpt" must validate cleanly.
    alpaca_entry = {
        "path": "mhenrichsen/alpaca_2k_test",
        "type": "alpaca",
    }
    alpaca_cfg = DictDefault(minimal_cfg | {"datasets": [alpaca_entry]})
    validate_config(alpaca_cfg)