Feat: Add Magistral and mistral-common tokenizer support (#2780)

2025-06-12 16:18:33 -07:00
parent ace9287c96
commit eac4a61f55
15 changed files with 1213 additions and 14 deletions
--- a/docs/config.qmd
+++ b/docs/config.qmd
@@ -27,6 +27,8 @@ trust_remote_code:
 tokenizer_use_fast:
 # Whether to use the legacy tokenizer setting, defaults to True
 tokenizer_legacy:
+# Whether to use the mistral-common tokenizer. Set to True to enable it.
+tokenizer_use_mistral_common:
 # Resize the model embeddings when new tokens are added to multiples of 32
 # This is reported to improve training speed on some models
 resize_token_embeddings_to_32x: