RL datasets: warn and drop unsalvageable over-length prompts post-truncate; add post-truncate filter; support alias config key 'excess_token_handling'

This commit is contained in:
mhenrhcsen
2025-08-12 20:37:41 +02:00
parent 618b008e36
commit f5a3e3529e
8 changed files with 844 additions and 27 deletions

15
transscribe.py Normal file
View File

@@ -0,0 +1,15 @@
"""Print the first document in ``tts_data`` that has no ``transcription`` field.

The MongoDB connection string is read from the ``MONGO_URI`` environment
variable so that credentials are never stored in source control.
"""
import os

import pymongo

# SECURITY: the connection string (including the password) used to be
# hard-coded on this line. Anything that was committed that way remains in the
# repository history and should be treated as compromised and rotated.
MONGO_URI = os.environ.get("MONGO_URI")
if not MONGO_URI:
    raise RuntimeError(
        "MONGO_URI environment variable is not set; "
        "export the MongoDB connection string before running this script."
    )
COLLECTION_NAME = "tts_data"

client = pymongo.MongoClient(MONGO_URI)
try:
    # The database and the collection share the same name.
    db = client["tts_data"]
    collection = db[COLLECTION_NAME]

    # Select every document that does not have a "transcription" field yet.
    documents = collection.find({"transcription": {"$exists": False}})

    # Only the first match is printed; the loop exits after one iteration.
    for document in documents:
        print(document)
        break
finally:
    # Release the connection pool (and any open cursor) even if the query fails.
    client.close()