diff --git a/.nojekyll b/.nojekyll index 22f7b7aed..476d4c53d 100644 --- a/.nojekyll +++ b/.nojekyll @@ -1 +1 @@ -7964ed8f \ No newline at end of file +c2c0a3d8 \ No newline at end of file diff --git a/docs/dataset-formats/conversation.html b/docs/dataset-formats/conversation.html index 5ac0666ba..c5336d9ab 100644 --- a/docs/dataset-formats/conversation.html +++ b/docs/dataset-formats/conversation.html @@ -805,14 +805,16 @@ Warning
If you have tool arguments with the same name but different dtypes (like "time": string and "time": number), please save `arguments` as a JSON string to prevent datasets from running into casting issues.
"arguments": "{\"...\": \"...\"}"
+The same applies to tool parameters.
+"parameters": "{\"...\": \"...\"}"
Example config for Llama4:
-chat_template: llama4
-datasets:
- - path: Nanobit/text-tools-2k-test
- type: chat_template
- # field_tools: tools # default is `tools`
+chat_template: llama4
+datasets:
+ - path: Nanobit/text-tools-2k-test
+ type: chat_template
+ # field_tools: tools # default is `tools`
The configuration would look like:
-datasets:
- - path: ...
- type: chat_template
- chat_template: tokenizer_default
- field_messages: conversations
- message_property_mappings:
- role: from
- content: value
- roles_to_train: []
- train_on_eos: turn
- message_field_training: train
- message_field_training_detail: train_detail
+datasets:
+ - path: ...
+ type: chat_template
+ chat_template: tokenizer_default
+ field_messages: conversations
+ message_property_mappings:
+ role: from
+ content: value
+ roles_to_train: []
+ train_on_eos: turn
+ message_field_training: train
+ message_field_training_detail: train_detail
After split, it will look like:
+{
+ "reasoning_content": "Some thinking outputs",
+ "content": "Output after thinking..."
+}
data.jsonl
{"conversations": [{"role": "...", "value": "..."}]}{"conversations": [{"role": "...", "value": "..."}]}