From 54b542d312b6ddf27de73e56f59dc6624698696a Mon Sep 17 00:00:00 2001
From: mhenrhcsen
Date: Tue, 12 Aug 2025 21:09:40 +0200
Subject: [PATCH] remove unused files

---
 gsp.yml                               |  58 ---
 kb.py                                 | 584 --------------------------
 src/axolotl/utils/callbacks/orph3.yml |  57 ---
 3 files changed, 699 deletions(-)
 delete mode 100644 gsp.yml
 delete mode 100644 kb.py
 delete mode 100644 src/axolotl/utils/callbacks/orph3.yml

diff --git a/gsp.yml b/gsp.yml
deleted file mode 100644
index f33ce518a..000000000
--- a/gsp.yml
+++ /dev/null
@@ -1,58 +0,0 @@
-base_model: ibm-granite/granite-speech-3.3-2b
-# Remove model_type to let Axolotl auto-detect the correct model type
-# model_type: GraniteSpeechForConditionalGeneration
-# Enable trust_remote_code to use the model's custom code
-trust_remote_code: true
-# Mark as multimodal since this is a speech model
-is_multimodal: true
-
-hub_model_id: syvai/gsp
-
-plugins:
-  - axolotl.integrations.liger.LigerPlugin
-liger_rope: true
-liger_rms_norm: true
-liger_glu_activation: true
-liger_fused_linear_cross_entropy: true
-
-datasets:
-  - path: syvai/coral-tts-asr
-    type: # leave empty to load pre-tokenized
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.02
-output_dir: ./outputs/out
-eval_sample_packing: False
-
-sequence_len: 8192
-sample_packing: true
-pad_to_sequence_len: true
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 16
-micro_batch_size: 1
-num_epochs: 3
-optimizer: adamw_torch_fused
-lr_scheduler: cosine
-learning_rate: 2e-5
-
-bf16: auto
-tf32: false
-
-gradient_checkpointing: true
-gradient_checkpointing_kwargs:
-  use_reentrant: false
-resume_from_checkpoint:
-logging_steps: 1
-flash_attention: true
-
-warmup_steps: 20
-evals_per_epoch: 5
-saves_per_epoch: 5
-weight_decay: 0.05
-
-#save_first_step: true # uncomment this to validate checkpoint saving works with your config
\ No newline at end of file
diff --git a/kb.py b/kb.py
deleted file mode 100644
index f709c035e..000000000
--- a/kb.py
+++ /dev/null
@@ -1,584 +0,0 @@
-import requests
-import json
-import urllib.parse
-import time
-import os
-import subprocess
-import pymongo
-from faster_whisper import WhisperModel, BatchedInferencePipeline
-import librosa
-import soundfile as sf
-import torch
-import torchaudio.transforms as T
-from snac import SNAC
-
-MONGO_URI = "mongodb://root:9AsYmXYKmYLHcNsShmCb3L5DZMXH77rQ9GBRxm0HKownNWLwdzH9dW7zhPG9mpuR@46.4.101.229:8281/?directConnection=true"
-COLLECTION_NAME = "tts_data"
-
-device = torch.device("cuda" if torch.cuda.is_available() else "mps")
-client = pymongo.MongoClient(MONGO_URI)
-db = client["tts_data"]
-collection = db[COLLECTION_NAME]
-
-model = WhisperModel("deepdml/faster-whisper-large-v3-turbo-ct2")
-batched_model = BatchedInferencePipeline(model)
-
-snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
-snac_model = snac_model.to(device)
-
-
-class ApiService:
-    def __init__(self):
-        self.client = requests.Session()
-        self.auth_cookie = None
-        self.kb_domain = "www.kb.dk"
-        self.api_domain = "api.kaltura.nordu.net"
-        self.ds_api_domain = "www.kb.dk"
-        self.kaltura_partner_id = "397"
-        self.kaltura_widget_id = "_397"
-        self.kaltura_player_version = "html5:v3.14.4"
-
-    def fetch_data(self, url):
-        """Henter rå tekstdata fra en given URL."""
-        headers = {'User-Agent': 'Mozilla/5.0'}
-
-        if self.auth_cookie:
-            headers['Cookie'] = self.auth_cookie
-
-        try:
-            response = self.client.get(url, headers=headers)
-            response.raise_for_status()
-            return response.text
-        except requests.RequestException as e:
-            print(f"Kunne ikke hente data fra {url}: {e}")
-            return None
-
-    def _generate_kaltura_stream_link(self, entry_id: str, flavor_id: str, file_ext: str) -> str:
-        """
-        Genererer et komplet Kaltura stream-link ud fra entryId, flavorId og filendelse.
-        """
-        return (
-            f"https://vod-cache.kaltura.nordu.net/p/{self.kaltura_partner_id}/sp/{self.kaltura_partner_id}00/serveFlavor/"
-            f"entryId/{entry_id}/v/12/flavorId/{flavor_id}/name/a.{file_ext}"
-        )
-
-    def extract_media_url_from_kaltura_response(self, response_data):
-        """
-        Udtrækker media URL. Bruger nu altid _generate_kaltura_stream_link for at få en direkte MP4 flavor URL.
-        Forventer et multirequest-svar fra Kaltura.
-        """
-        try:
-            data = json.loads(response_data)
-            # context_object = data[2] # Not strictly needed if we don't use flavor_assets directly from here for HLS
-            # flavor_assets = context_object.get('flavorAssets', []) # Not strictly needed
-            sources = data[2].get('sources', []) # Still need sources to get a flavorId
-
-            # We need an entry_id and a flavor_id to build the serveFlavor URL.
-            # file_ext will be determined by the flavor if possible, or default.
-
-            media_object_list = data[1].get('objects', [])
-            if not media_object_list:
-                print("Manglende 'objects' i Kaltura-respons data[1].")
-                return None
-
-            media_object = media_object_list[0]
-            entry_id = media_object.get('id', '')
-
-            current_flavor_id = None
-            file_ext = "mp4" # Default to mp4, can be overridden if flavor asset info is available
-
-            # Try to get flavorId from sources if available
-            if isinstance(sources, list) and sources:
-                # Assuming the first source's flavorId is relevant for a downloadable MP4
-                # The 'sources' array often contains multiple formats and qualities.
-                # We need to pick one that is likely to be a simple video file.
-                # Let's iterate to find one with 'video/mp4' or a common video format
-                found_flavor_for_mp4 = False
-                for source_item in sources:
-                    if isinstance(source_item, dict):
-                        s_format = source_item.get('format')
-                        s_mimetype = source_item.get('mimetype')
-                        # Prioritize a flavorId that seems to be for an MP4
-                        if s_mimetype == 'video/mp4' or s_format == 'url': # 'url' format sometimes links to MP4
-                            temp_flavor_id = source_item.get('flavorIds')
-                            if temp_flavor_id: # flavorIds can be a string like "0_xxxx,0_yyyy"
-                                current_flavor_id = temp_flavor_id.split(',')[0] # Take the first one
-                                # Check if flavorAssets has more info on this flavorId
-                                flavor_assets = data[2].get('flavorAssets', [])
-                                if isinstance(flavor_assets, list):
-                                    for asset in flavor_assets:
-                                        if asset.get('id') == current_flavor_id and asset.get('fileExt'):
-                                            file_ext = asset.get('fileExt')
-                                            break
-                                found_flavor_for_mp4 = True
-                                break
-                if not found_flavor_for_mp4 and isinstance(sources, list) and sources: # Fallback to first if no explicit mp4 found
-                    current_flavor_id = sources[0].get('flavorIds','').split(',')[0]
-
-
-            # If flavorId is still not found, try getting it from flavorAssets as a last resort
-            # This part of logic might be less reliable as flavorAssets might not directly map
-            # to a simple downloadable flavor if sources didn't provide one.
-            if not current_flavor_id:
-                flavor_assets = data[2].get('flavorAssets', [])
-                if isinstance(flavor_assets, list) and flavor_assets:
-                    # Heuristic: pick the first flavor asset that is not 'audio*' or 'image*' if possible
-                    # and hope it's a video.
-                    for asset in flavor_assets:
-                        tags = asset.get('tags', '')
-                        if 'audio' not in tags and 'image' not in tags and 'caption' not in tags: # try to avoid non-video
-                            current_flavor_id = asset.get('id')
-                            file_ext = asset.get('fileExt', 'mp4')
-                            break
-                    if not current_flavor_id and flavor_assets: # If still nothing, just take the first one
-                        current_flavor_id = flavor_assets[0].get('id')
-                        file_ext = flavor_assets[0].get('fileExt', 'mp4')
-
-
-            if not (entry_id and current_flavor_id):
-                print(f"Manglende data til at bygge media URL (entry_id: {entry_id}, flavor_id: {current_flavor_id}).")
-                # Print more context if URL generation fails
-                print(f" entry_id from data[1]: {entry_id}")
-                print(f" Attempted current_flavor_id: {current_flavor_id}")
-                print(f" Sources object: {str(sources)[:200]}...")
-                print(f" FlavorAssets object: {str(data[2].get('flavorAssets', []))[:200]}...")
-                return None
-
-            # Ensure file_ext is sensible
-            if not file_ext or len(file_ext) > 5: # basic sanity check
-                file_ext = "mp4"
-
-            print(f" Generating serveFlavor URL with entry_id: {entry_id}, flavor_id: {current_flavor_id}, ext: {file_ext}")
-            media_url = self._generate_kaltura_stream_link(entry_id, current_flavor_id, file_ext)
-            return media_url
-
-        except (KeyError, IndexError, TypeError, json.JSONDecodeError) as e:
-            print(f"Kunne ikke parse media-url fra Kaltura-respons: {e}")
-            print(f"Response data snippet: {str(response_data)[:500]}")
-            return None
-        except Exception as e:
-            print(f"Uventet fejl under parsing af Kaltura-respons: {e}")
-            return None
-
-    def fetch_kaltura_data(self, entry_id):
-        """Henter metadata og afspilningsinformation for en specifik Kaltura entry."""
-        url = f"https://{self.api_domain}/api_v3/service/multirequest"
-        json_payload = {
-            "1": {
-                "service": "session",
-                "action": "startWidgetSession",
-                "widgetId": self.kaltura_widget_id
-            },
-            "2": {
-                "service": "baseEntry",
-                "action": "list",
-                "ks": "{1:result:ks}",
-                "filter": {"redirectFromEntryId": entry_id},
-                "responseProfile": {
-                    "type": 1,
-                    "fields": "id,referenceId,name,duration,description,thumbnailUrl,dataUrl,duration,msDuration,flavorParamsIds,mediaType,type,tags,startTime,date,dvrStatus,externalSourceType,status"
-                }
-            },
-            "3": {
-                "service": "baseEntry",
-                "action": "getPlaybackContext",
-                "entryId": "{2:result:objects:0:id}",
-                "ks": "{1:result:ks}",
-                "contextDataParams": {
-                    "objectType": "KalturaContextDataParams",
-                    "flavorTags": "all"
-                }
-            },
-            "4": {
-                "service": "metadata_metadata",
-                "action": "list",
-                "filter": {
-                    "objectType": "KalturaMetadataFilter",
-                    "objectIdEqual": "{2:result:objects:0:id}",
-                    "metadataObjectTypeEqual": "1"
-                },
-                "ks": "{1:result:ks}"
-            },
-            "apiVersion": "3.3.0",
-            "format": 1,
-            "ks": "",
-            "clientTag": self.kaltura_player_version,
-            "partnerId": self.kaltura_partner_id
-        }
-
-        headers = {
-            'Accept': 'application/json, text/plain, */*',
-            'Accept-Encoding': 'gzip, deflate, br, zstd',
-            'Accept-Language': 'en-US,en;q=0.5',
-            'Connection': 'keep-alive',
-            'Host': self.api_domain,
-            'Referer': f'https://{self.kb_domain}/find-materiale/dr-arkivet/',
-            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:136.0) Gecko/20100101 Firefox/136.0',
-            'Content-Type': 'application/json'
-        }
-
-        if self.auth_cookie:
-            headers['Cookie'] = f"Authorization={self.auth_cookie}"
-
-        try:
-            response = self.client.post(url, json=json_payload, headers=headers)
-            response.raise_for_status()
-            # logging.debug(f"Kaltura response for entry {entry_id}: {response.text}")
-            return response.text
-        except requests.RequestException as e:
-            print(f"Kunne ikke hente Kaltura-data for entry {entry_id}: {e}")
-            return None
-
-    def authenticate(self, on_complete):
-        """
-        Udfører autentifikation mod KB-API'en og gemmer auth-cookie til senere brug.
-        'on_complete' er en callback-funktion, der kaldes uanset resultat.
-        """
-        current_unix_time = int(time.time())
-
-        cookie_header = (
-            f"""ppms_privacy_6c58358e-1595-4533-8cf8-9b1c061871d0={{"visitorId":"0478c604-ce60-4537-8e17-fdb53fcd5c31","domain":{{"normalized":"{self.kb_domain}","isWildcard":false,"pattern":"{self.kb_domain}"}},"consents":{{"analytics":{{"status":1}}}}}}; """
-            f"""CookieScriptConsent={{"bannershown":1,"action":"reject","consenttime":{current_unix_time},"categories":"[]","key":"99a8bf43-ba89-444c-9333-2971c53e72a6"}}"""
-        )
-
-        auth_url = f"https://{self.ds_api_domain}/ds-api/bff/v1/authenticate/"
-        headers = {
-            'Accept': 'application/json, text/plain, */*',
-            'Cookie': cookie_header,
-            'Referer': f'https://{self.kb_domain}/find-materiale/dr-arkivet/'
-        }
-
-        try:
-            response = self.client.get(auth_url, headers=headers)
-            response.raise_for_status()
-            cookies = response.cookies.get_dict()
-            auth_cookie = cookies.get("Authorization")
-            if auth_cookie:
-                self.auth_cookie = auth_cookie
-                print("Autentificering gennemført og auth-cookie gemt.")
-            else:
-                print("Ingen Authorization-cookie fundet i svaret.")
-        except requests.RequestException as e:
-            print(f"Autentificering mislykkedes: {e}")
-        finally:
-            on_complete()
-
-    def fetch_search_results(self, search_term="*:*", start_index=0, sort_option="startTime asc", rows=10, media_type="", year_start=2005, year_end=2026, month_number=1):
-        """
-        Henter søgeresultater fra KB's DR-arkiv-API.
-        Understøtter medietype-filtrering for 'ds.radio' og 'ds.tv'.
-        """
-        encoded = urllib.parse.quote(search_term, safe='*')
-        media_filter = self._build_media_filter(media_type)
-
-        url = (
-            f"https://{self.ds_api_domain}/ds-api/bff/v1/proxy/search/?q={encoded}{media_filter}"
-            f"&facet=false&start={start_index}&sort={urllib.parse.quote(sort_option)}&rows={rows}"
-            f"&fq=startTime:[{year_start}-12-31T23:00:00.000Z TO {year_end}-12-31T22:59:59.999Z]"
-            f"&fq=temporal_start_month:{month_number}"
-        )
-
-        headers = {
-            'Accept': 'application/json, text/plain, */*',
-            'Accept-Encoding': 'gzip, deflate, br, zstd',
-            'Accept-Language': 'en-US,en;q=0.5',
-            'Connection': 'keep-alive',
-            'Host': self.ds_api_domain,
-            'Referer': f'https://{self.kb_domain}/find-materiale/dr-arkivet/find',
-            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:136.0) Gecko/20100101 Firefox/136.0'
-        }
-
-        if self.auth_cookie:
-            headers['Cookie'] = f"Authorization={self.auth_cookie}"
-
-        try:
-            response = self.client.get(url, headers=headers)
-            response.raise_for_status()
-            return response.json()
-        except requests.HTTPError as e:
-            print(f"HTTP {response.status_code} ved forespørgsel til søge-API: {e}")
-            return None
-        except requests.RequestException as e:
-            print(f"Forespørgsel til søge-API mislykkedes: {e}")
-            return None
-        except json.JSONDecodeError:
-            print("Kunne ikke parse JSON-respons fra søge-API.")
-            return None
-
-    def _build_media_filter(self, media_type):
-        """Bygger media filter strengen baseret på media type."""
-        if media_type in ("ds.radio", "ds.tv"):
-            return f"&fq=origin%3A%22{media_type}%22"
-        return ""
-
-    def parse_search_response(self, response_data):
-        """
-        Parser JSON-streng til Python-objekt.
-        Returnerer None hvis input er ugyldigt.
- """ - try: - return json.loads(response_data) if response_data else None - except json.JSONDecodeError as e: - print(f"Kunne ikke parse søge-respons: {e}") - return None - - def download_media(self, media_url, filename, download_path="video_files"): - """Downloader medie fra en URL og gemmer det i den specificerede sti.""" - if not media_url: - print(" Download skipped: No media URL provided.") - return None # Return None to indicate failure/skip - - try: - if not os.path.exists(download_path): - os.makedirs(download_path) - - filepath = os.path.join(download_path, filename) - - print(f" Downloading {media_url} to {filepath}...") - response = self.client.get(media_url, stream=True) - response.raise_for_status() - - with open(filepath, 'wb') as f: - for chunk in response.iter_content(chunk_size=8192): - f.write(chunk) - print(f" Successfully downloaded {filepath}") - return filepath # Return the path to the downloaded file - except requests.RequestException as e: - print(f" Failed to download {media_url}: {e}") - return None - except IOError as e: - print(f" Failed to save file {filepath}: {e}") - return None - except Exception as e: - print(f" An unexpected error occurred during download: {e}") - return None - - def extract_audio(self, input_filepath, output_filename, output_path="audio_files"): - """Extract audio from a local media file using PyAV. - Saves the audio as an MP3 file. - """ - if not input_filepath or not os.path.exists(input_filepath): - print(f" Audio extraction skipped: Input file not provided or does not exist: {input_filepath}") - return False - - try: - if not os.path.exists(output_path): - os.makedirs(output_path) - - base, ext = os.path.splitext(output_filename) - if ext.lower() != ".mp3": - output_filename = base + ".mp3" - - output_filepath = os.path.join(output_path, output_filename) - - print(f" Attempting to extract audio using PyAV.") - print(f" Input file: {input_filepath}") - print(f" Output file: {output_filepath}") - - # Use PyAV to extract audio - import av - - # Open the input file - input_container = av.open(input_filepath) - - # Create the output container - output_container = av.open(output_filepath, mode='w') - - # Add an audio stream to the output - output_stream = output_container.add_stream('mp3') - - # Process the input audio - for frame in input_container.decode(audio=0): - # Encode the frame - packet = output_stream.encode(frame) - if packet: - output_container.mux(packet) - - # Flush any remaining packets - packet = output_stream.encode(None) - if packet: - output_container.mux(packet) - - # Close the containers - output_container.close() - input_container.close() - - print(f" Successfully extracted audio to {output_filepath}") - return output_filepath # Return the path to the extracted audio file - - except Exception as e: - print(f" An unexpected error occurred during audio extraction from {input_filepath}: {e}") - return False - - def split_audio(self, audio_path: str, segments: list[dict]): - """Splits the audio file into segments based on the start and end times.""" - try: - print(f"Loading audio file for splitting: {audio_path}") - print(f"Using device: {device.type}") - y, sr = librosa.load(audio_path, sr=None) # Load with original sample rate - print(f"Original sample rate: {sr} Hz") - - # Target sample rate for SNAC - target_sr = 24000 - - # Convert to tensor for processing - waveform = torch.from_numpy(y).float() - - # Use torchaudio for resampling - if sr != target_sr: - print(f"Resampling from {sr} Hz to {target_sr} Hz using 
torchaudio") - resampler = T.Resample(orig_freq=sr, new_freq=target_sr) - waveform = resampler(waveform) - sr = target_sr - - # Split the audio into segments - chunks = [] - for segment in segments: - # Convert time to samples - start_time = segment["start"] - end_time = segment["end"] - start_sample = int(start_time * sr) - end_sample = int(end_time * sr) - text = segment["text"] - - print(f"Processing segment: {start_time:.2f}s - {end_time:.2f}s") - - # Make sure we don't go out of bounds - if start_sample >= len(waveform): - print(f"Warning: Start sample {start_sample} exceeds audio length {len(waveform)}") - continue - - end_sample = min(end_sample, len(waveform)) - - # Extract segment - chunk = waveform[start_sample:end_sample] - - # Format tensor exactly as in the example: - # 1. First unsqueeze to make it [1, length] - # 2. Then unsqueeze again to make it [1, 1, length] - chunk_tensor = chunk.unsqueeze(0).unsqueeze(0).to(device) - - with torch.inference_mode(): - print(f"Encoding segment with SNAC, waveform shape: {chunk_tensor.shape}") - codes = snac_model.encode(chunk_tensor) - print(f"Generated codes with shape: {codes.shape if hasattr(codes, 'shape') else 'N/A'}") - - all_codes = [] - for i in range(codes[0].shape[1]): - all_codes.append(codes[0][0][i].item()+128266) - all_codes.append(codes[1][0][2*i].item()+128266+4096) - all_codes.append(codes[2][0][4*i].item()+128266+(2*4096)) - all_codes.append(codes[2][0][(4*i)+1].item()+128266+(3*4096)) - all_codes.append(codes[1][0][(2*i)+1].item()+128266+(4*4096)) - all_codes.append(codes[2][0][(4*i)+2].item()+128266+(5*4096)) - all_codes.append(codes[2][0][(4*i)+3].item()+128266+(6*4096)) - - chunks.append({"text": text.strip(), "all_codes": all_codes, "audio_duration": end_time - start_time}) - - return chunks - - except Exception as e: - print(f"Error in split_audio: {e}") - import traceback - traceback.print_exc() - return [] - -if __name__ == "__main__": - kb = ApiService() - kb.authenticate(lambda: print("Autentificering gennemført")) - # iterate over all pages of search results up - # months = [1,2,3,4,5,6,7,8,9,10,11,12] - # years = [2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024,2025] - # for year in years: - # for month in months: - # total_results = kb.fetch_search_results(media_type="ds.tv", start_index=0, rows=10, year_start=year, year_end=year+1, month_number=month)["response"]["numFound"] - # print(f"Total results: {total_results}") - # - # total_pages = total_results // 100 - # for page in range(1, total_pages): - # print(f"Fetching page {page} of {total_pages}... 
{year} {month}") - # search_results = kb.fetch_search_results(media_type="ds.tv", start_index=page*100, rows=100, year_start=year, year_end=year+1, month_number=month) - # - - - # if search_results and isinstance(search_results, dict): - # # Access the nested 'docs' list within 'response' - # response_dict = search_results.get("response") - # if response_dict and isinstance(response_dict, dict): - # results_list = response_dict.get("docs") - # else: - # results_list = None - # - # if results_list is not None and isinstance(results_list, list): - # print(f"Processing {len(results_list)} results...") - # # list of entry_ids not in the database - # ready_to_add = [] - # for result in results_list: - # if isinstance(result, dict) and "kaltura_id" in result: - # entry_id = result["kaltura_id"] - # # Check if the entry_id is already in the database if not then insert it - # if not collection.find_one({"kaltura_id": entry_id}): - # ready_to_add.append({"kaltura_id": entry_id, "year": year, "month": month}) - # else: - # print(f"Entry ID {entry_id} already exists in the database. Skipping...") - # - # # batch adds - # if len(ready_to_add) > 0: - # collection.insert_many(ready_to_add) - # print(f"Inserted {len(ready_to_add)} new entry IDs into the database.") - # else: - # print("No new entry IDs to insert.") - # print(f"Fetching Kaltura data for entry ID: {entry_id}...") - - # Get all documents from the collection that does not have a "transcription" field - documents = collection.find({"transcription": {"$exists": False}}) - - for document in documents: - print(document) - entry_id = document["kaltura_id"] - kaltura_data_str = kb.fetch_kaltura_data(entry_id) - print(f" Kaltura data: {kaltura_data_str}") - - if kaltura_data_str: - # Extract the stream link using the existing method - media_url = kb.extract_media_url_from_kaltura_response(kaltura_data_str) - if media_url: - print(f" Stream link for {entry_id}: {media_url}") - - # Step 1: Download the MP4 file - # Construct a filename for the MP4, e.g., kaltura_id.mp4 - # The file extension is already part of the media_url generation logic or defaults to mp4 - mp4_filename = f"{entry_id}.{media_url.split('.')[-1].split('?')[0] if '.' 
-                downloaded_mp4_path = kb.download_media(media_url, mp4_filename, download_path="downloads")
-
-                if downloaded_mp4_path:
-                    # Step 2: Convert the downloaded MP4 to MP3
-                    output_audio_filename = f"{entry_id}.mp3" # Output as mp3
-                    extracted_audio_path = kb.extract_audio(downloaded_mp4_path, output_audio_filename, output_path="audio_files")
-
-                    # Step 3: Transcribe the audio only if extraction was successful
-                    if extracted_audio_path:
-                        segments, info = batched_model.transcribe(extracted_audio_path, batch_size=16)
-
-                        print(f"Info: {info}")
-                        segments_list = []
-                        for segment in segments:
-                            print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
-                            segments_list.append({"start": segment.start, "end": segment.end, "text": segment.text})
-
-                        # split the audio into the segments
-                        chunks = kb.split_audio(extracted_audio_path, segments_list)
-
-                        # save the chunks to the database
-                        collection.update_one({"kaltura_id": entry_id}, {"$set": {"chunks": chunks}})
-
-                        print(f"Transcription saved to the database for {entry_id}")
-
-                        # Step 5: Delete the MP4 and MP3 files
-                        os.remove(downloaded_mp4_path)
-                        os.remove(extracted_audio_path)
-                    else:
-                        print(f"Skipping transcription for {entry_id} because audio extraction failed.")
-
-                else:
-                    print(f" Skipping audio extraction for {entry_id} because MP4 download failed.")
\ No newline at end of file
diff --git a/src/axolotl/utils/callbacks/orph3.yml b/src/axolotl/utils/callbacks/orph3.yml
deleted file mode 100644
index d27ed8bca..000000000
--- a/src/axolotl/utils/callbacks/orph3.yml
+++ /dev/null
@@ -1,57 +0,0 @@
-base_model: syvai/tts-v1-pretrained
-# Automatically upload checkpoint and final model to HF
-hub_model_id: syvai/tts-v0.3-finetuned
-
-plugins:
-  - axolotl.integrations.liger.LigerPlugin
-liger_rope: true
-liger_rms_norm: true
-liger_glu_activation: true
-liger_fused_linear_cross_entropy: true
-
-datasets:
-  - path: syvai/zac-coral-tts
-    type:
-  - path: syvai/zac-dk-voice-pro
-    type:
-  - path: syvai/zac-dk-voice-single-speaker
-    type:
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.01
-eval_sample_packing: False
-output_dir: ./outputs/finetuned
-
-sequence_len: 8196
-sample_packing: true
-pad_to_sequence_len: true
-
-wandb_project: orph
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 16
-micro_batch_size: 2
-num_epochs: 3
-optimizer: adamw_torch_fused
-lr_scheduler: cosine
-learning_rate: 2e-5
-
-bf16: auto
-tf32: false
-
-gradient_checkpointing: true
-gradient_checkpointing_kwargs:
-  use_reentrant: false
-resume_from_checkpoint:
-logging_steps: 1
-flash_attention: true
-
-warmup_steps: 3
-evals_per_epoch: 5
-saves_per_epoch: 5
-weight_decay: 0.05
-
-special_tokens:
-  pad_token:
\ No newline at end of file