From 30150fe1e186ea31c85415aada49f4a4284d1f24 Mon Sep 17 00:00:00 2001
From: mhenrichsen
Date: Tue, 6 May 2025 10:11:06 +0200
Subject: [PATCH] Adds example for training a TTS model on top of an LLM. (#2614)

* Adds example for training a TTS model on top of an LLM.

* Update examples/orpheus/finetune.yml

Co-authored-by: NanoCode012

* Update examples/orpheus/finetune.yml

Co-authored-by: NanoCode012

* Update README.md to clarify GPU requirements for finetuning Orpheus TTS model

* Update finetune.yml to use the new base model canopylabs/orpheus-3b-0.1-pretrained

* Update finetune.yml and README.md for consistency and clarity

---------

Co-authored-by: NanoCode012
---
 examples/orpheus/README.md    | 341 ++++++++++++++++++++++++++++++++++
 examples/orpheus/finetune.yml |  52 ++++++
 2 files changed, 393 insertions(+)
 create mode 100644 examples/orpheus/README.md
 create mode 100644 examples/orpheus/finetune.yml

diff --git a/examples/orpheus/README.md b/examples/orpheus/README.md
new file mode 100644
index 000000000..5fea05ecf
--- /dev/null
+++ b/examples/orpheus/README.md
@@ -0,0 +1,341 @@
# Finetuning LLMs to output audio

In this example, we finetune canopylabs/orpheus-3b-0.1-pretrained (a Llama 3.2 3B model) to output audio.

The `finetune.yml` with the current settings will run on any NVIDIA GPU with 45GB of VRAM or more. If you lower the batch size, it can run on GPUs with less than 24GB of VRAM.

## Dataset pre-processing for pre-training

If you are only adding another English voice, skip ahead to the finetune pre-processing section.

For this to work, we need to preprocess our dataset. Since we expect the model to output audio, we need to add audio tokens on top of the text tokenizer's vocabulary.

The code below downloads the SNAC codec model, encodes each audio clip into SNAC tokens, builds the training sequences, and uploads the final dataset to the Hub.
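Both scripts in this README assume the source dataset is a Hugging Face audio dataset providing an `audio` column and a `text` transcript column. A minimal sketch of the row shape they expect (the values here are purely illustrative):

```python
import numpy as np

# Illustrative only: your dataset supplies real recordings and transcripts.
example = {
    "audio": {
        "array": np.zeros(24000, dtype=np.float32),  # one second of mono audio
        "sampling_rate": 24000,
    },
    "text": "The transcript of the clip.",
}
```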
```python
import os

import torch
import torchaudio.transforms as T
from datasets import load_dataset
from huggingface_hub import snapshot_download
from snac import SNAC
from transformers import AutoTokenizer

my_original_dataset_name = ""
name_to_push_dataset_to = ""

dsn = my_original_dataset_name

# Pre-fetch the dataset files in parallel before loading.
snapshot_download(
    repo_id=dsn,
    repo_type="dataset",
    revision="main",
    max_workers=64,
)

ds = load_dataset(dsn, split="train")
ds_sample_rate = ds[0]["audio"]["sampling_rate"]

# Load the SNAC audio codec; keep the model and tensors on the same device.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
model = model.to(device)

def tokenise_audio(waveform):
    waveform = torch.from_numpy(waveform).unsqueeze(0)
    waveform = waveform.to(dtype=torch.float32)
    resample_transform = T.Resample(orig_freq=ds_sample_rate, new_freq=24000)
    waveform = resample_transform(waveform)

    waveform = waveform.unsqueeze(0).to(device)

    # Generate the codes from SNAC.
    with torch.inference_mode():
        codes = model.encode(waveform)

    # Flatten the three SNAC codebook levels into 7 tokens per frame,
    # shifting each position into its own 4096-wide ID range above the
    # base vocabulary (128256 text tokens + 10 special tokens = 128266).
    all_codes = []
    for i in range(codes[0].shape[1]):
        all_codes.append(codes[0][0][i].item() + 128266)
        all_codes.append(codes[1][0][2 * i].item() + 128266 + 4096)
        all_codes.append(codes[2][0][4 * i].item() + 128266 + (2 * 4096))
        all_codes.append(codes[2][0][(4 * i) + 1].item() + 128266 + (3 * 4096))
        all_codes.append(codes[1][0][(2 * i) + 1].item() + 128266 + (4 * 4096))
        all_codes.append(codes[2][0][(4 * i) + 2].item() + 128266 + (5 * 4096))
        all_codes.append(codes[2][0][(4 * i) + 3].item() + 128266 + (6 * 4096))

    return all_codes

def add_codes(example):
    # Always initialize codes_list to None
    codes_list = None

    try:
        answer_audio = example.get("audio")
        # If there's a valid audio array, tokenise it
        if answer_audio and "array" in answer_audio:
            audio_array = answer_audio["array"]
            codes_list = tokenise_audio(audio_array)
    except Exception as e:
        print(f"Skipping row due to error: {e}")
        # Keep codes_list as None if we fail
    example["codes_list"] = codes_list

    return example

ds = ds.map(add_codes, remove_columns=["audio"])

#@title Load Tokenizer
tokeniser_length = 128256
start_of_text = 128000
end_of_text = 128009

start_of_speech = tokeniser_length + 1
end_of_speech = tokeniser_length + 2

start_of_human = tokeniser_length + 3
end_of_human = tokeniser_length + 4

start_of_ai = tokeniser_length + 5
end_of_ai = tokeniser_length + 6
pad_token = tokeniser_length + 7

audio_tokens_start = tokeniser_length + 10

tokenizer_name = "canopylabs/orpheus-3b-0.1-pretrained"

tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
num_proc = os.cpu_count() - 2

# Drop rows where audio tokenisation failed or produced nothing.
ds = ds.filter(lambda x: x["codes_list"] is not None)
ds = ds.filter(lambda x: len(x["codes_list"]) > 0)

#@title Create Input Ids
def remove_duplicate_frames(example):
    vals = example["codes_list"]
    if len(vals) % 7 != 0:
        raise ValueError("Input list length must be divisible by 7")

    result = vals[:7]

    removed_frames = 0

    for i in range(7, len(vals), 7):
        current_first = vals[i]
        previous_first = result[-7]

        if current_first != previous_first:
            result.extend(vals[i:i + 7])
        else:
            removed_frames += 1

    example["codes_list"] = result

    return example

ds = ds.map(remove_duplicate_frames, num_proc=num_proc)
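# Each training sequence is laid out as:
#   [start_of_human] <text tokens> <end_of_text> [end_of_human]
#   [start_of_ai] [start_of_speech] <SNAC audio tokens> [end_of_speech] [end_of_ai]
# so the model learns to emit audio tokens conditioned on the text prompt.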
def create_input_ids(example):
    text_ids = tokenizer.encode(example["text"], add_special_tokens=True)
    text_ids.append(end_of_text)
    example["text_tokens"] = text_ids
    input_ids = (
        [start_of_human]
        + example["text_tokens"]
        + [end_of_human]
        + [start_of_ai]
        + [start_of_speech]
        + example["codes_list"]
        + [end_of_speech]
        + [end_of_ai]
    )
    example["input_ids"] = input_ids
    example["labels"] = input_ids
    example["attention_mask"] = [1] * len(input_ids)

    return example

ds = ds.map(create_input_ids, num_proc=num_proc, remove_columns=["text", "codes_list"])

#@title Remove unnecessary columns
columns_to_keep = ["input_ids", "labels", "attention_mask"]
columns_to_remove = [col for col in ds.column_names if col not in columns_to_keep]

ds = ds.remove_columns(columns_to_remove)

ds.push_to_hub(name_to_push_dataset_to)
```

## Finetune pre-processing

Use this code to add a new voice. It is the same pipeline as the pre-training script above, except that the transcript is prefixed with a speaker id.

```python
import os

import torch
import torchaudio.transforms as T
from datasets import load_dataset
from huggingface_hub import snapshot_download
from snac import SNAC
from transformers import AutoTokenizer

my_original_dataset_name = ""
name_to_push_dataset_to = ""

dsn = my_original_dataset_name

# Pre-fetch the dataset files in parallel before loading.
snapshot_download(
    repo_id=dsn,
    repo_type="dataset",
    revision="main",
    max_workers=64,
)

ds = load_dataset(dsn, split="train")
ds_sample_rate = ds[0]["audio"]["sampling_rate"]

# Load the SNAC audio codec; keep the model and tensors on the same device.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
model = model.to(device)

def tokenise_audio(waveform):
    waveform = torch.from_numpy(waveform).unsqueeze(0)
    waveform = waveform.to(dtype=torch.float32)
    resample_transform = T.Resample(orig_freq=ds_sample_rate, new_freq=24000)
    waveform = resample_transform(waveform)

    waveform = waveform.unsqueeze(0).to(device)

    # Generate the codes from SNAC.
    with torch.inference_mode():
        codes = model.encode(waveform)

    # Flatten the three SNAC codebook levels into 7 tokens per frame,
    # shifting each position into its own 4096-wide ID range.
    all_codes = []
    for i in range(codes[0].shape[1]):
        all_codes.append(codes[0][0][i].item() + 128266)
        all_codes.append(codes[1][0][2 * i].item() + 128266 + 4096)
        all_codes.append(codes[2][0][4 * i].item() + 128266 + (2 * 4096))
        all_codes.append(codes[2][0][(4 * i) + 1].item() + 128266 + (3 * 4096))
        all_codes.append(codes[1][0][(2 * i) + 1].item() + 128266 + (4 * 4096))
        all_codes.append(codes[2][0][(4 * i) + 2].item() + 128266 + (5 * 4096))
        all_codes.append(codes[2][0][(4 * i) + 3].item() + 128266 + (6 * 4096))

    return all_codes

def add_codes(example):
    # Always initialize codes_list to None
    codes_list = None

    try:
        answer_audio = example.get("audio")
        # If there's a valid audio array, tokenise it
        if answer_audio and "array" in answer_audio:
            audio_array = answer_audio["array"]
            codes_list = tokenise_audio(audio_array)
    except Exception as e:
        print(f"Skipping row due to error: {e}")
        # Keep codes_list as None if we fail
    example["codes_list"] = codes_list

    return example

ds = ds.map(add_codes, remove_columns=["audio"])

#@title Load Tokenizer
tokeniser_length = 128256
start_of_text = 128000
end_of_text = 128009

start_of_speech = tokeniser_length + 1
end_of_speech = tokeniser_length + 2

start_of_human = tokeniser_length + 3
end_of_human = tokeniser_length + 4

start_of_ai = tokeniser_length + 5
end_of_ai = tokeniser_length + 6
pad_token = tokeniser_length + 7

audio_tokens_start = tokeniser_length + 10

tokenizer_name = "canopylabs/orpheus-3b-0.1-pretrained"

tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
num_proc = os.cpu_count() - 2

# Drop rows where audio tokenisation failed or produced nothing.
ds = ds.filter(lambda x: x["codes_list"] is not None)
ds = ds.filter(lambda x: len(x["codes_list"]) > 0)

#@title Create Input Ids
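# A frame is treated as a duplicate of its predecessor when its first
# (coarsest) SNAC code repeats; such frames are dropped to shorten sequences.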
def remove_duplicate_frames(example):
    vals = example["codes_list"]
    if len(vals) % 7 != 0:
        raise ValueError("Input list length must be divisible by 7")

    result = vals[:7]

    removed_frames = 0

    for i in range(7, len(vals), 7):
        current_first = vals[i]
        previous_first = result[-7]

        if current_first != previous_first:
            result.extend(vals[i:i + 7])
        else:
            removed_frames += 1

    example["codes_list"] = result

    return example

ds = ds.map(remove_duplicate_frames, num_proc=num_proc)

tok_info = '''*** HERE you can modify the text prompt.
For a multispeaker model like canopylabs/orpheus-3b-0.1-ft, prefix the
transcript with the speaker, e.g. f"{example['speaker_id']}: {example['text']}",
as is done below.
'''
print(tok_info)

def create_input_ids(example):
    text_ids = tokenizer.encode(f"{example['speaker_id']}: {example['text']}", add_special_tokens=True)
    text_ids.append(end_of_text)
    example["text_tokens"] = text_ids
    input_ids = (
        [start_of_human]
        + example["text_tokens"]
        + [end_of_human]
        + [start_of_ai]
        + [start_of_speech]
        + example["codes_list"]
        + [end_of_speech]
        + [end_of_ai]
    )
    example["input_ids"] = input_ids
    example["labels"] = input_ids
    example["attention_mask"] = [1] * len(input_ids)

    return example

ds = ds.map(create_input_ids, num_proc=num_proc, remove_columns=["text", "codes_list"])

#@title Remove unnecessary columns
columns_to_keep = ["input_ids", "labels", "attention_mask"]
columns_to_remove = [col for col in ds.column_names if col not in columns_to_keep]

ds = ds.remove_columns(columns_to_remove)

ds.push_to_hub(name_to_push_dataset_to)
```

## Training

After preprocessing is done, fill in the blanks in `finetune.yml` and run `axolotl train finetune.yml`.

## Inference

For inference, please refer to the original [Orpheus GitHub repository](https://github.com/canopyai/Orpheus-TTS/tree/main).
diff --git a/examples/orpheus/finetune.yml b/examples/orpheus/finetune.yml
new file mode 100644
index 000000000..9bcbbeee0
--- /dev/null
+++ b/examples/orpheus/finetune.yml
@@ -0,0 +1,52 @@
base_model: canopylabs/orpheus-3b-0.1-pretrained

hub_model_id:

plugins:
  - axolotl.integrations.liger.LigerPlugin
liger_rope: true
liger_rms_norm: true
liger_glu_activation: true
liger_fused_linear_cross_entropy: true

datasets:
  - path:
    type: # leave empty to load pre-tokenized
dataset_prepared_path: last_run_prepared
val_set_size: 0.01
output_dir: ./outputs/out

sequence_len: 8192
sample_packing: true
pad_to_sequence_len: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 8
micro_batch_size: 4
num_epochs: 3
optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 2e-5

bf16: auto
tf32: false

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_steps: 20
evals_per_epoch: 5
saves_per_epoch: 5
weight_decay: 0.05

special_tokens:
  pad_token:
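# Illustrative placeholders for the blanks above (names are examples, not
# recommendations): point `datasets.path` at the dataset pushed by the
# pre-processing script, e.g.
#
#   hub_model_id: your-username/orpheus-tts-finetune
#   datasets:
#     - path: your-username/your-tokenized-dataset
#       type:
#   wandb_project: orpheus-tts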