diff --git a/gen_tts_dataset.py b/gen_tts_dataset.py new file mode 100644 index 0000000..3fa1632 --- /dev/null +++ b/gen_tts_dataset.py @@ -0,0 +1,1352 @@ +#!/usr/bin/env python3 +"""Generate TTS normalization LoRA training dataset. + +Language is EXPLICITLY tagged [en] or [pl] by the TTS client — no detection/inference. +Currency, units, and content are language-independent (Polish text can have dollars, +English text can have złoty). + +Output: JSONL in ShareGPT conversation format for Unsloth/Qwen2.5-Instruct. +""" + +import json +import random +import re +from pathlib import Path +from num2words import num2words + +SEED = 42 +OUTPUT = Path("tts_norm_dataset.jsonl") + +SYSTEM = ( + "You are a TTS text preprocessor. The input begins with a language tag [en] or [pl] " + "that specifies the target speech language. Normalize the text for natural speech synthesis: " + "expand numbers to words in the tagged language, handle acronyms (spell out letter-acronyms " + "with spaces between letters, keep pronounceable acronyms intact), fix spelling errors, " + "convert symbols and URLs to spoken form, and strip markdown formatting. " + "Output only the normalized text without the language tag." 
+) + +# ────────────────────────────────────────────────────────────── +# NATO PHONETIC ALPHABET +# ────────────────────────────────────────────────────────────── +NATO = { + 'A': 'Alfa', 'B': 'Bravo', 'C': 'Charlie', 'D': 'Delta', + 'E': 'Echo', 'F': 'Foxtrot', 'G': 'Golf', 'H': 'Hotel', + 'I': 'India', 'J': 'Juliett', 'K': 'Kilo', 'L': 'Lima', + 'M': 'Mike', 'N': 'November', 'O': 'Oscar', 'P': 'Papa', + 'Q': 'Quebec', 'R': 'Romeo', 'S': 'Sierra', 'T': 'Tango', + 'U': 'Uniform', 'V': 'Victor', 'W': 'Whiskey', 'X': 'X-ray', + 'Y': 'Yankee', 'Z': 'Zulu', +} + +# ────────────────────────────────────────────────────────────── +# ACRONYMS — spelled letter-by-letter +# ────────────────────────────────────────────────────────────── +SPELLED_ACRONYMS = { + 'API': 'A P I', 'CPU': 'C P U', 'GPU': 'G P U', 'RAM': 'R A M', + 'SSD': 'S S D', 'HTTP': 'H T T P', 'HTTPS': 'H T T P S', + 'URL': 'U R L', 'HTML': 'H T M L', 'CSS': 'C S S', + 'FBI': 'F B I', 'CIA': 'C I A', 'NFL': 'N F L', 'NBA': 'N B A', + 'MVP': 'M V P', 'CEO': 'C E O', 'CFO': 'C F O', 'CTO': 'C T O', + 'AI': 'A I', 'ML': 'M L', 'OS': 'O S', 'UI': 'U I', 'UX': 'U X', + 'IP': 'I P', 'VPN': 'V P N', 'DNS': 'D N S', 'SSH': 'S S H', + 'FTP': 'F T P', 'SQL': 'S Q L', 'XML': 'X M L', 'JSON': 'J S O N', + 'AWS': 'A W S', 'USB': 'U S B', 'HDMI': 'H D M I', + 'LCD': 'L C D', 'LED': 'L E D', 'PDF': 'P D F', 'FAQ': 'F A Q', + 'DIY': 'D I Y', 'ETA': 'E T A', 'FYI': 'F Y I', 'ASAP': 'A S A P', + 'GPS': 'G P S', 'ATM': 'A T M', 'BBC': 'B B C', 'CNN': 'C N N', + 'EU': 'E U', 'UN': 'U N', 'UK': 'U K', 'US': 'U S', + 'RGB': 'R G B', 'TCP': 'T C P', 'UDP': 'U D P', + 'IDE': 'I D E', 'SDK': 'S D K', 'CI': 'C I', 'CD': 'C D', + 'PR': 'P R', 'QA': 'Q A', 'IT': 'I T', 'HR': 'H R', + 'PKP': 'P K P', 'PZU': 'P Z U', 'ZUS': 'Z U S', 'NFZ': 'N F Z', + 'PKO': 'P K O', 'TVP': 'T V P', 'TVN': 'T V N', + 'KRS': 'K R S', 'NIP': 'N I P', 'VAT': 'V A T', + 'ZTM': 'Z T M', 'MPK': 'M P K', 'GUS': 'G U S', 'PGE': 'P G E', + 'NBP': 'N B P', 'PKB': 'P 
# ──────────────────────────────────────────────────────────────
# ACRONYMS — spelled letter-by-letter
# ──────────────────────────────────────────────────────────────
# Every spoken form is the acronym's own letters separated by single spaces,
# so the mapping is derived from the key list instead of being typed twice.
# Key order is kept as in the hand-written literal because the generators
# sample from list(SPELLED_ACRONYMS.items()) under a fixed seed.
_SPELLED_ACRONYM_KEYS = """
    API CPU GPU RAM
    SSD HTTP HTTPS
    URL HTML CSS
    FBI CIA NFL NBA
    MVP CEO CFO CTO
    AI ML OS UI UX
    IP VPN DNS SSH
    FTP SQL XML JSON
    AWS USB HDMI
    LCD LED PDF FAQ
    DIY ETA FYI ASAP
    GPS ATM BBC CNN
    EU UN UK US
    RGB TCP UDP
    IDE SDK CI CD
    PR QA IT HR
    PKP PZU ZUS NFZ
    PKO TVP TVN
    KRS NIP VAT
    ZTM MPK GUS PGE
    NBP PKB UE ONZ
    RPO NIK SLD
    PIS PSL PCK
    AGH UJ UW PWN
    IPN ABW CBA
    PIT CIT PESEL
    TTS NPC RPG PVP
    DPS MMO RNG AFK
    LLM NLP OCR ORM
""".split()

SPELLED_ACRONYMS = {acr: ' '.join(acr) for acr in _SPELLED_ACRONYM_KEYS}

# Acronyms pronounced as words (not spelled)
WORD_ACRONYMS = [
    'NASA', 'NATO', 'LASER', 'RADAR', 'SCUBA', 'PIN', 'SIM', 'BIOS',
    'CAPTCHA', 'AWOL', 'GIF', 'JPEG', 'GOPR', 'IKEA', 'FIAT',
    'UNICEF', 'UNESCO', 'AIDS', 'COVID', 'BASIC', 'SWIFT', 'DART',
    'RUST', 'AJAX', 'LIDAR', 'MODEM', 'PIXEL',
]

# Words that LOOK like acronyms (all-caps) but are just regular words.
# 'STOP' used to appear twice, which doubled its weight under random.choice.
NOT_ACRONYMS = [
    'NVIDIA', 'HELLO', 'STOP', 'WARNING', 'ERROR', 'DANGER',
    'IMPORTANT', 'NOTE', 'URGENT', 'ATTENTION', 'WELCOME', 'EXIT',
    'OPEN', 'CLOSE', 'START', 'FINISH', 'UWAGA', 'WEJŚCIE',
    'WYJŚCIE', 'ZAMKNIĘTE', 'OTWARTE',
]
# ──────────────────────────────────────────────────────────────
# SPELLING ERRORS
# ──────────────────────────────────────────────────────────────
# Misspelling → correction pairs for English.
TYPOS_EN = {
    'recieve': 'receive', 'definately': 'definitely', 'occured': 'occurred',
    'seperate': 'separate', 'accomodate': 'accommodate', 'neccessary': 'necessary',
    'wierd': 'weird', 'occassion': 'occasion', 'concious': 'conscious',
    'enviroment': 'environment', 'goverment': 'government', 'independant': 'independent',
    'knowlege': 'knowledge', 'langauge': 'language', 'maintainance': 'maintenance',
    'millenium': 'millennium', 'noticable': 'noticeable', 'persistant': 'persistent',
    'publically': 'publicly', 'recomend': 'recommend', 'refering': 'referring',
    'successfull': 'successful', 'suprise': 'surprise', 'tommorow': 'tomorrow',
    'untill': 'until', 'wether': 'whether', 'wich': 'which',
    'thier': 'their', 'teh': 'the', 'adn': 'and', 'hte': 'the',
    'becuase': 'because', 'beleive': 'believe', 'calender': 'calendar',
    'collegue': 'colleague', 'comittee': 'committee', 'dissapoint': 'disappoint',
    'embarass': 'embarrass', 'existance': 'existence', 'foriegn': 'foreign',
    'gurantee': 'guarantee', 'harrass': 'harass', 'imediately': 'immediately',
    'jewlery': 'jewelry', 'judgement': 'judgment', 'liason': 'liaison',
}

# Misspelling → correction pairs for Polish. Identity entries (key == value)
# are already-correct words kept as no-op examples.
# Two former entries were removed because they are not spelling corrections
# and would have become training targets:
#   'wyrzygnąć' → 'wyrzucić'  (replaces one real verb with a different one)
#   'porządże'  → 'porządze'  (maps a non-word to another non-word)
TYPOS_PL = {
    'rząd': 'rząd',  # correct, no-op example
    'wziąść': 'wziąć', 'włanczyć': 'włączyć', 'poszłem': 'poszedłem',
    'pokarze': 'pokaże', 'napewno': 'na pewno', 'wogóle': 'w ogóle',
    'niewiem': 'nie wiem', 'przedewszystkim': 'przede wszystkim',
    'conajmniej': 'co najmniej', 'natomist': 'natomiast',
    'ponieważ': 'ponieważ',  # correct
    'żadko': 'rzadko', 'bynajmiej': 'bynajmniej',
    'jakby': 'jakby',  # correct
    'pomóżcie': 'pomóżcie',  # correct
    'włożyłam': 'włożyłam',  # correct
    'spróbój': 'spróbuj',
    'przyjeżdzać': 'przyjeżdżać', 'żółtko': 'żółtko',  # correct
    'gżegżółka': 'gżegżółka',  # correct — hard word
    'sprawdźić': 'sprawdzić', 'ząmówić': 'zamówić',
    'orginalny': 'oryginalny', 'symultaniczny': 'symultaniczny',
    'odzwyczaić': 'odzwyczaić',  # correct
    'czterysta': 'czterysta',  # correct
    'rzentelny': 'rzetelny', 'wchodzić': 'wchodzić',  # correct
}
# ──────────────────────────────────────────────────────────────
# SENTENCE TEMPLATES (with {} placeholders)
# ──────────────────────────────────────────────────────────────
# Each template has exactly one positional "{}" slot; the EN and PL lists are
# index-aligned translations of one another.

# Carrier sentences for a bare number.
NUM_TEMPLATES_EN = [
    "There are {} items in the queue.",
    "The building has {} floors.",
    "She scored {} points in the final round.",
    "Approximately {} people attended the event.",
    "{} units were shipped yesterday.",
    "The file contains {} lines of code.",
    "He waited {} minutes for the train.",
    "The population reached {} last year.",
    "We need {} more signatures.",
    "The distance is {} meters.",
    "Page {} of the document.",
    "Chapter {} covers advanced topics.",
    "Flight {} departs at noon.",
    "Room {} is on the third floor.",
    "The team completed {} sprints this quarter.",
]

NUM_TEMPLATES_PL = [
    "W kolejce jest {} elementów.",
    "Budynek ma {} pięter.",
    "Zdobyła {} punktów w finale.",
    "Na wydarzeniu pojawiło się około {} osób.",
    "Wysłano {} jednostek wczoraj.",
    "Plik zawiera {} linii kodu.",
    "Czekał {} minut na pociąg.",
    "Populacja osiągnęła {} w zeszłym roku.",
    "Potrzebujemy jeszcze {} podpisów.",
    "Odległość wynosi {} metrów.",
    "Strona {} dokumentu.",
    "Rozdział {} obejmuje zaawansowane tematy.",
    "Lot {} odlatuje w południe.",
    "Pokój {} jest na trzecim piętrze.",
    "Zespół ukończył {} sprintów w tym kwartale.",
]

# Carrier sentences for a currency amount.
CURRENCY_TEMPLATES_EN = [
    "The total is {}.",
    "She paid {}.",
    "The budget was set at {}.",
    "It costs {} per unit.",
    "They raised {} for charity.",
    "The invoice shows {}.",
    "Repairs will cost approximately {}.",
    "The price dropped to {}.",
]

CURRENCY_TEMPLATES_PL = [
    "Łącznie to {}.",
    "Zapłaciła {}.",
    "Budżet ustalono na {}.",
    "Kosztuje {} za sztukę.",
    "Zebrali {} na cele charytatywne.",
    "Faktura pokazuje {}.",
    "Naprawa będzie kosztować około {}.",
    "Cena spadła do {}.",
]

# Carrier sentences for an acronym.
ACRONYM_TEMPLATES_EN = [
    "The {} system is down.",
    "Check the {} settings.",
    "We're migrating to {}.",
    "The {} update is ready.",
    "{} performance improved significantly.",
    "Connect via {} to the server.",
    "The {} team approved the request.",
]

ACRONYM_TEMPLATES_PL = [
    "System {} nie działa.",
    "Sprawdź ustawienia {}.",
    "Migrujemy do {}.",
    "Aktualizacja {} jest gotowa.",
    "Wydajność {} znacząco się poprawiła.",
    "Połącz się przez {} z serwerem.",
    "Zespół {} zatwierdził wniosek.",
]
# ──────────────────────────────────────────────────────────────
# HELPERS
# ──────────────────────────────────────────────────────────────

def n2w(n, lang):
    """Spell the cardinal number *n* in language *lang* using num2words.

    Negative ints/floats are rendered as the literal word "minus" followed by
    the spelling of the absolute value, instead of passing the negative value
    to num2words.
    """
    is_negative = isinstance(n, (int, float)) and n < 0
    if not is_negative:
        return num2words(n, lang=lang)
    return f"minus {num2words(abs(n), lang=lang)}"


def n2w_ordinal(n, lang):
    """Spell *n* as an ordinal in language *lang* using num2words."""
    return num2words(n, to='ordinal', lang=lang)


def en_currency_name(symbol, n):
    """Return the English currency word for *symbol*, singular iff |n| == 1.

    Symbols missing from the table are returned unchanged.
    """
    forms = {
        '$': ('dollar', 'dollars'),
        '€': ('euro', 'euros'),
        'zł': ('zloty', 'zloty'),
        '£': ('pound', 'pounds'),
        '¥': ('yen', 'yen'),
        'CHF': ('Swiss franc', 'Swiss francs'),
        'kr': ('krona', 'kronor'),
    }
    singular, plural = forms.get(symbol, (symbol, symbol))
    if abs(n) == 1:
        return singular
    return plural


def pl_currency_name(symbol, n):
    """Return the Polish currency word for *symbol*, declined for the count *n*.

    Three forms are distinguished: |n| == 1; counts whose last digit is 2–4
    but whose last two digits are not 12–14; everything else. Symbols missing
    from the table are returned unchanged.
    """
    declensions = {
        '$': ('dolar', 'dolary', 'dolarów'),
        '€': ('euro', 'euro', 'euro'),
        'zł': ('złoty', 'złote', 'złotych'),
        '£': ('funt', 'funty', 'funtów'),
        '¥': ('jen', 'jeny', 'jenów'),
        'CHF': ('frank szwajcarski', 'franki szwajcarskie', 'franków szwajcarskich'),
        'kr': ('korona', 'korony', 'koron'),
    }
    one, few, many = declensions.get(symbol, (symbol, symbol, symbol))

    magnitude = abs(n)
    if magnitude == 1:
        return one
    last_digit = magnitude % 10
    last_two = magnitude % 100
    if 2 <= last_digit <= 4 and not 12 <= last_two <= 14:
        return few
    return many


def fmt_currency(n, symbol, lang):
    """Return "<amount in words> <currency name>" for *lang*.

    'en' uses the English name table; every other value of *lang* uses the
    Polish one.
    """
    amount_words = n2w(n, lang)
    namer = en_currency_name if lang == 'en' else pl_currency_name
    return f"{amount_words} {namer(symbol, n)}"


def make(raw, norm):
    """Build one ShareGPT conversation: system prompt, *raw* input, *norm* target."""
    turns = [
        {"from": "system", "value": SYSTEM},
        {"from": "human", "value": raw},
        {"from": "gpt", "value": norm},
    ]
    return {"conversations": turns}
"""Cardinal numbers in sentence context.""" + examples = [] + ranges = [ + (0, 20), (21, 99), (100, 999), (1000, 9999), + (10000, 99999), (100000, 999999), + ] + for _ in range(count): + lo, hi = random.choice(ranges) + n = random.randint(lo, hi) + + # English + tpl = random.choice(NUM_TEMPLATES_EN) + examples.append(make( + f"[en] {tpl.format(n)}", + tpl.format(n2w(n, 'en')) + )) + + # Polish + tpl = random.choice(NUM_TEMPLATES_PL) + examples.append(make( + f"[pl] {tpl.format(n)}", + tpl.format(n2w(n, 'pl')) + )) + + return examples + + +def gen_negatives_decimals(count=80): + """Negative numbers and decimals.""" + examples = [] + for _ in range(count): + # Negative integers + n = -random.randint(1, 500) + examples.append(make( + f"[en] The temperature is {n} degrees.", + f"The temperature is {n2w(n, 'en')} degrees." + )) + examples.append(make( + f"[pl] Temperatura wynosi {n} stopni.", + f"Temperatura wynosi {n2w(n, 'pl')} stopni." + )) + + # Decimals + whole = random.randint(0, 999) + frac = random.randint(1, 99) + dec_str = f"{whole}.{frac}" + dec_val = float(dec_str) + + en_spoken = f"{n2w(whole, 'en')} point {' '.join(n2w(int(d), 'en') for d in str(frac))}" + pl_spoken = f"{n2w(whole, 'pl')} przecinek {' '.join(n2w(int(d), 'pl') for d in str(frac))}" + + examples.append(make( + f"[en] The measurement reads {dec_str}.", + f"The measurement reads {en_spoken}." + )) + examples.append(make( + f"[pl] Pomiar wskazuje {dec_str}.", + f"Pomiar wskazuje {pl_spoken}." + )) + + return examples + + +def gen_ordinals(count=80): + """Ordinal numbers.""" + examples = [] + for _ in range(count): + n = random.randint(1, 100) + suffix_raw = {1: 'st', 2: 'nd', 3: 'rd'}.get(n % 10, 'th') + if 11 <= n % 100 <= 13: + suffix_raw = 'th' + + examples.append(make( + f"[en] This is the {n}{suffix_raw} attempt.", + f"This is the {n2w_ordinal(n, 'en')} attempt." + )) + examples.append(make( + f"[pl] To jest {n}. próba.", + f"To jest {n2w_ordinal(n, 'pl')} próba." 
+ )) + return examples + + +def gen_percentages(count=60): + """Percentage expressions.""" + examples = [] + for _ in range(count): + n = random.choice([ + random.randint(0, 100), + round(random.uniform(0, 100), 1), + ]) + + if isinstance(n, float): + whole = int(n) + frac = str(n).split('.')[1] + en_w = f"{n2w(whole, 'en')} point {' '.join(n2w(int(d), 'en') for d in frac)}" + pl_w = f"{n2w(whole, 'pl')} przecinek {' '.join(n2w(int(d), 'pl') for d in frac)}" + else: + en_w = n2w(n, 'en') + pl_w = n2w(n, 'pl') + + examples.append(make( + f"[en] The success rate is {n}%.", + f"The success rate is {en_w} percent." + )) + examples.append(make( + f"[pl] Wskaźnik sukcesu wynosi {n}%.", + f"Wskaźnik sukcesu wynosi {pl_w} procent." + )) + return examples + + +def gen_dates(count=100): + """Date expressions in various formats.""" + examples = [] + + months_en = ['January', 'February', 'March', 'April', 'May', 'June', + 'July', 'August', 'September', 'October', 'November', 'December'] + months_pl_gen = ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca', + 'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'] + + for _ in range(count): + day = random.randint(1, 28) + month = random.randint(1, 12) + year = random.randint(1990, 2030) + + # EN: "05/25/2026" or "25.05.2026" or "2026-05-25" + en_day_ord = n2w_ordinal(day, 'en') + en_year = n2w(year, 'en') + en_spoken = f"{months_en[month-1]} {en_day_ord}, {en_year}" + + fmt = random.choice(['us', 'eu', 'iso']) + if fmt == 'us': + raw_date = f"{month:02d}/{day:02d}/{year}" + elif fmt == 'eu': + raw_date = f"{day:02d}.{month:02d}.{year}" + else: + raw_date = f"{year}-{month:02d}-{day:02d}" + + examples.append(make( + f"[en] The deadline is {raw_date}.", + f"The deadline is {en_spoken}." 
+ )) + + # PL: "25.05.2026" or "25 maja 2026" + pl_day_ord = n2w_ordinal(day, 'pl') + pl_year = n2w(year, 'pl') + pl_spoken = f"{pl_day_ord} {months_pl_gen[month-1]} {pl_year}" + + raw_date_pl = f"{day:02d}.{month:02d}.{year}" + examples.append(make( + f"[pl] Termin to {raw_date_pl}.", + f"Termin to {pl_spoken}." + )) + + return examples + + +def gen_times(count=80): + """Time expressions — 12h/24h.""" + examples = [] + for _ in range(count): + h24 = random.randint(0, 23) + m = random.randint(0, 59) + + # 24h format + time_str = f"{h24:02d}:{m:02d}" + en_h = n2w(h24, 'en') + en_m = n2w(m, 'en') if m != 0 else "" + en_spoken = f"{en_h} {en_m}".strip() if m != 0 else f"{en_h} hundred" if h24 > 0 else "midnight" + + pl_h = n2w(h24, 'pl') + pl_m = n2w(m, 'pl') if m != 0 else "" + pl_spoken = f"{pl_h} {pl_m}".strip() + + examples.append(make( + f"[en] The meeting is at {time_str}.", + f"The meeting is at {en_spoken}." + )) + examples.append(make( + f"[pl] Spotkanie jest o {time_str}.", + f"Spotkanie jest o {pl_spoken}." + )) + + # 12h format (EN only common) + h12 = h24 % 12 or 12 + ampm = "AM" if h24 < 12 else "PM" + time_12 = f"{h12}:{m:02d} {ampm}" + en_12_spoken = f"{n2w(h12, 'en')} {n2w(m, 'en') if m else ''} {ampm}".strip() + examples.append(make( + f"[en] Alarm set for {time_12}.", + f"Alarm set for {en_12_spoken}." 
+ )) + + return examples + + +def gen_currency(count=120): + """Currency — any currency symbol in any language.""" + examples = [] + symbols = ['$', '€', 'zł', '£'] + prefixed = {'$', '€', '£'} # symbol before number + postfixed = {'zł'} # symbol after number + + for _ in range(count): + sym = random.choice(symbols) + n = random.choice([ + random.randint(1, 99), + random.randint(100, 9999), + random.randint(10000, 999999), + ]) + + if sym in prefixed: + raw_amount = f"{sym}{n:,}" + else: + raw_amount = f"{n:,} {sym}" + + # English + tpl = random.choice(CURRENCY_TEMPLATES_EN) + examples.append(make( + f"[en] {tpl.format(raw_amount)}", + tpl.format(fmt_currency(n, sym, 'en')) + )) + + # Polish + tpl = random.choice(CURRENCY_TEMPLATES_PL) + examples.append(make( + f"[pl] {tpl.format(raw_amount)}", + tpl.format(fmt_currency(n, sym, 'pl')) + )) + + # Cents/grosze + for _ in range(20): + whole = random.randint(1, 999) + cents = random.randint(1, 99) + sym = random.choice(symbols) + + if sym in prefixed: + raw = f"{sym}{whole}.{cents:02d}" + else: + raw = f"{whole}.{cents:02d} {sym}" + + en_spoken = f"{n2w(whole, 'en')} {en_currency_name(sym, whole)} and {n2w(cents, 'en')} cents" + pl_spoken = f"{n2w(whole, 'pl')} {pl_currency_name(sym, whole)} i {n2w(cents, 'pl')} groszy" + + examples.append(make(f"[en] Total: {raw}", f"Total: {en_spoken}")) + examples.append(make(f"[pl] Łącznie: {raw}", f"Łącznie: {pl_spoken}")) + + return examples + + +def gen_units(count=80): + """Units and measurements.""" + units = { + 'km': ('kilometers', 'kilometrów'), + 'm': ('meters', 'metrów'), + 'cm': ('centimeters', 'centymetrów'), + 'mm': ('millimeters', 'milimetrów'), + 'kg': ('kilograms', 'kilogramów'), + 'g': ('grams', 'gramów'), + 'mg': ('milligrams', 'miligramów'), + 'l': ('liters', 'litrów'), + 'ml': ('milliliters', 'mililitrów'), + 'km/h': ('kilometers per hour', 'kilometrów na godzinę'), + 'mph': ('miles per hour', 'mil na godzinę'), + 'GB': ('gigabytes', 'gigabajtów'), + 'MB': 
('megabytes', 'megabajtów'), + 'TB': ('terabytes', 'terabajtów'), + 'GHz': ('gigahertz', 'gigaherców'), + 'MHz': ('megahertz', 'megaherców'), + 'kW': ('kilowatts', 'kilowatów'), + 'W': ('watts', 'watów'), + 'V': ('volts', 'woltów'), + 'A': ('amperes', 'amperów'), + } + + templates_en = [ + "The speed is {} {}.", + "It weighs {} {}.", + "The capacity is {} {}.", + "Measured {} {}.", + "Maximum: {} {}.", + ] + templates_pl = [ + "Prędkość wynosi {} {}.", + "Waży {} {}.", + "Pojemność to {} {}.", + "Zmierzono {} {}.", + "Maksimum: {} {}.", + ] + + examples = [] + for _ in range(count): + unit, (en_name, pl_name) = random.choice(list(units.items())) + n = random.choice([ + random.randint(1, 99), + random.randint(100, 9999), + round(random.uniform(0.1, 99.9), 1), + ]) + + raw_n = str(n) + if isinstance(n, float): + whole = int(n) + frac = str(n).split('.')[1] + en_w = f"{n2w(whole, 'en')} point {' '.join(n2w(int(d), 'en') for d in frac)}" + pl_w = f"{n2w(whole, 'pl')} przecinek {' '.join(n2w(int(d), 'pl') for d in frac)}" + else: + en_w = n2w(n, 'en') + pl_w = n2w(n, 'pl') + + tpl = random.choice(templates_en) + examples.append(make( + f"[en] {tpl.format(raw_n, unit)}", + tpl.format(en_w, en_name) + )) + + tpl = random.choice(templates_pl) + examples.append(make( + f"[pl] {tpl.format(raw_n, unit)}", + tpl.format(pl_w, pl_name) + )) + + return examples + + +def gen_temperatures(count=40): + """Temperature expressions.""" + examples = [] + for _ in range(count): + t = random.randint(-30, 45) + examples.append(make( + f"[en] Current temperature: {t}°C.", + f"Current temperature: {n2w(t, 'en')} degrees Celsius." + )) + examples.append(make( + f"[pl] Aktualna temperatura: {t}°C.", + f"Aktualna temperatura: {n2w(t, 'pl')} stopni Celsjusza." + )) + + f_temp = random.randint(0, 110) + examples.append(make( + f"[en] It's {f_temp}°F outside.", + f"It's {n2w(f_temp, 'en')} degrees Fahrenheit outside." 
+ )) + + return examples + + +def gen_acronyms(count=150): + """Acronyms — spelled out vs pronounced as words.""" + examples = [] + spelled_items = list(SPELLED_ACRONYMS.items()) + + for _ in range(count): + # Spelled acronym in sentence + acr, spelled = random.choice(spelled_items) + + tpl = random.choice(ACRONYM_TEMPLATES_EN) + examples.append(make( + f"[en] {tpl.format(acr)}", + tpl.format(spelled) + )) + + tpl = random.choice(ACRONYM_TEMPLATES_PL) + examples.append(make( + f"[pl] {tpl.format(acr)}", + tpl.format(spelled) + )) + + # Word acronym (should NOT be spelled) + word_acr = random.choice(WORD_ACRONYMS) + examples.append(make( + f"[en] The {word_acr} project launched.", + f"The {word_acr} project launched." + )) + examples.append(make( + f"[pl] Projekt {word_acr} wystartował.", + f"Projekt {word_acr} wystartował." + )) + + # Not-acronym all-caps (should lowercase or preserve) + not_acr = random.choice(NOT_ACRONYMS) + examples.append(make( + f"[en] {not_acr}: read the instructions.", + f"{not_acr}: read the instructions." + )) + + return examples + + +def gen_nato(count=100): + """NATO phonetic alphabet — both encoding and decoding.""" + examples = [] + letters = list(NATO.keys()) + + for _ in range(count): + # Decode: "Alpha Bravo Charlie" → "ABC" + length = random.randint(2, 6) + chosen = random.sample(letters, length) + nato_str = ' '.join(NATO[c] for c in chosen) + letter_str = ''.join(chosen) + spelled_str = ' '.join(chosen) + + examples.append(make( + f"[en] Callsign: {nato_str}.", + f"Callsign: {spelled_str}." + )) + examples.append(make( + f"[pl] Znak wywoławczy: {nato_str}.", + f"Znak wywoławczy: {spelled_str}." + )) + + # Encode: spelled letters → NATO (reverse direction) + examples.append(make( + f"[en] Spell {letter_str} using NATO phonetic.", + f"{nato_str}." + )) + examples.append(make( + f"[pl] Przeliteruj {letter_str} alfabetem NATO.", + f"{nato_str}." 
def gen_nato(count=100):
    """Generate NATO phonetic alphabet examples (5 * count in total).

    Per iteration: a decode example in [en] and [pl] (code words → letters
    separated by spaces), an encode example in [en] and [pl] (letters → code
    words), and one extra [en] decode example with a fresh letter sequence.
    """
    examples = []
    alphabet = list(NATO.keys())

    def phonetic(seq):
        # "ABC" / ['A', 'B', 'C'] → "Alfa Bravo Charlie"
        return ' '.join(NATO[ch] for ch in seq)

    for _ in range(count):
        # Decode: code words → spaced letters
        size = random.randint(2, 6)
        picked = random.sample(alphabet, size)
        code_words = phonetic(picked)
        joined = ''.join(picked)
        spaced = ' '.join(picked)

        examples.append(make(
            f"[en] Callsign: {code_words}.",
            f"Callsign: {spaced}."
        ))
        examples.append(make(
            f"[pl] Znak wywoławczy: {code_words}.",
            f"Znak wywoławczy: {spaced}."
        ))

        # Encode: letters → code words (reverse direction)
        examples.append(make(
            f"[en] Spell {joined} using NATO phonetic.",
            f"{code_words}."
        ))
        examples.append(make(
            f"[pl] Przeliteruj {joined} alfabetem NATO.",
            f"{code_words}."
        ))

        # Decode in running text, with an independent letter sequence
        running = ''.join(random.sample(alphabet, random.randint(3, 5)))
        examples.append(make(
            f"[en] The code is {phonetic(running)}.",
            f"The code is {' '.join(running)}."
        ))

    return examples


def gen_spelling_corrections(count=80):
    """Generate spelling-correction examples (up to 2 * count in total).

    Only typo-table entries whose correction differs from the key are used.
    A language contributes nothing when it has no such entries.
    """
    examples = []

    # Identity entries (already-correct words) carry no correction to learn.
    real_typos_en = [pair for pair in TYPOS_EN.items() if pair[0] != pair[1]]
    real_typos_pl = [pair for pair in TYPOS_PL.items() if pair[0] != pair[1]]

    templates_en = [
        "The {} was unexpected.",
        "We need to {} the system.",
        "It was {} to everyone.",
        "The {} process failed.",
        "Please {} the document.",
    ]
    templates_pl = [
        "To było {}.",
        "Trzeba {} system.",
        "Było to {} dla wszystkich.",
        "Proces {} się nie powiódł.",
        "Proszę {} dokument.",
    ]

    sources = (
        ('en', real_typos_en, templates_en),
        ('pl', real_typos_pl, templates_pl),
    )
    for _ in range(count):
        for tag, typo_pairs, templates in sources:
            if not typo_pairs:
                continue
            wrong, right = random.choice(typo_pairs)
            carrier = random.choice(templates)
            examples.append(make(
                f"[{tag}] {carrier.format(wrong)}",
                carrier.format(right)
            ))

    return examples
random.choice(paths + ['']) + url = f"https://{domain}{path}" + + parts = domain.split('.') + tld = parts[-1] + name = ' dot '.join(parts[:-1]) + spoken_domain = f"{name} {tlds.get(tld, f'dot {tld}')}" + spoken_path = path.replace('/', ' slash ').replace('?', ' question mark ').replace('=', ' equals ').replace('#', ' hash ').strip() if path else '' + spoken_url = f"{spoken_domain} {spoken_path}".strip() + + examples.append(make( + f"[en] Visit {url} for details.", + f"Visit {spoken_url} for details." + )) + examples.append(make( + f"[pl] Odwiedź {url} po szczegóły.", + f"Odwiedź {spoken_url} po szczegóły." + )) + + # Email + user = random.choice(users) + edom = random.choice(email_domains) + email = f"{user}@{edom}" + eparts = edom.split('.') + etld = eparts[-1] + ename = ' dot '.join(eparts[:-1]) + spoken_email = f"{user} at {ename} {tlds.get(etld, f'dot {etld}')}" + + examples.append(make( + f"[en] Contact us at {email}.", + f"Contact us at {spoken_email}." + )) + examples.append(make( + f"[pl] Napisz do nas na {email}.", + f"Napisz do nas na {spoken_email}." + )) + + return examples + + +def gen_symbols(count=60): + """Symbol expansion.""" + symbol_map_en = { + '@': 'at', '&': 'and', '#': 'hash', '%': 'percent', + '+': 'plus', '=': 'equals', '/': 'slash', '\\': 'backslash', + '*': 'asterisk', '^': 'caret', '~': 'tilde', '|': 'pipe', + '<': 'less than', '>': 'greater than', + } + symbol_map_pl = { + '@': 'małpa', '&': 'i', '#': 'hash', '%': 'procent', + '+': 'plus', '=': 'równa się', '/': 'slash', '\\': 'backslash', + '*': 'gwiazdka', '^': 'daszek', '~': 'tylda', '|': 'kreska pionowa', + '<': 'mniejsze niż', '>': 'większe niż', + } + + examples = [] + for _ in range(count): + sym = random.choice(list(symbol_map_en.keys())) + examples.append(make( + f"[en] Press {sym} to continue.", + f"Press {symbol_map_en[sym]} to continue." + )) + examples.append(make( + f"[pl] Naciśnij {sym} aby kontynuować.", + f"Naciśnij {symbol_map_pl[sym]} aby kontynuować." 
def gen_symbols(count=60):
    """Generate symbol-expansion examples.

    Emits 2 * count "press <symbol>" examples (one [en], one [pl] per draw)
    followed by 20 binary arithmetic expressions in each language.
    """
    spoken_en = {
        '@': 'at', '&': 'and', '#': 'hash', '%': 'percent',
        '+': 'plus', '=': 'equals', '/': 'slash', '\\': 'backslash',
        '*': 'asterisk', '^': 'caret', '~': 'tilde', '|': 'pipe',
        '<': 'less than', '>': 'greater than',
    }
    spoken_pl = {
        '@': 'małpa', '&': 'i', '#': 'hash', '%': 'procent',
        '+': 'plus', '=': 'równa się', '/': 'slash', '\\': 'backslash',
        '*': 'gwiazdka', '^': 'daszek', '~': 'tylda', '|': 'kreska pionowa',
        '<': 'mniejsze niż', '>': 'większe niż',
    }
    # Inside arithmetic "*" and "/" are read as operators, not symbol names.
    operator_en = {'+': 'plus', '-': 'minus', '*': 'times', '/': 'divided by'}
    operator_pl = {'+': 'plus', '-': 'minus', '*': 'razy', '/': 'podzielone przez'}
    operators = ['+', '-', '*', '/']
    symbol_pool = list(spoken_en.keys())

    examples = []
    for _ in range(count):
        symbol = random.choice(symbol_pool)
        examples.append(make(
            f"[en] Press {symbol} to continue.",
            f"Press {spoken_en[symbol]} to continue."
        ))
        examples.append(make(
            f"[pl] Naciśnij {symbol} aby kontynuować.",
            f"Naciśnij {spoken_pl[symbol]} aby kontynuować."
        ))

    # Math expressions
    for _ in range(20):
        left = random.randint(1, 100)
        right = random.randint(1, 100)
        op = random.choice(operators)

        examples.append(make(
            f"[en] Calculate {left} {op} {right}.",
            f"Calculate {n2w(left, 'en')} {operator_en[op]} {n2w(right, 'en')}."
        ))
        examples.append(make(
            f"[pl] Oblicz {left} {op} {right}.",
            f"Oblicz {n2w(left, 'pl')} {operator_pl[op]} {n2w(right, 'pl')}."
        ))

    return examples
drugi krok", "pierwszy krok, drugi krok"), + ("> To jest cytat", "To jest cytat"), + ("~~usunięty~~ tekst", "usunięty tekst"), + ("***pogrubiona kursywa*** tekst", "pogrubiona kursywa tekst"), + ] + + for _ in range(count): + raw, norm = random.choice(md_pairs_en) + examples.append(make(f"[en] {raw}", norm)) + + raw, norm = random.choice(md_pairs_pl) + examples.append(make(f"[pl] {raw}", norm)) + + return examples + + +def gen_phone_numbers(count=40): + """Phone number normalization — read as digit groups.""" + examples = [] + for _ in range(count): + # Polish phone: +48 XXX XXX XXX + g1 = random.randint(100, 999) + g2 = random.randint(100, 999) + g3 = random.randint(100, 999) + phone_pl = f"+48 {g1} {g2} {g3}" + spoken_digits = lambda n, lang: ' '.join(n2w(int(d), lang) for d in str(n)) + pl_spoken = f"plus czterdzieści osiem, {spoken_digits(g1, 'pl')}, {spoken_digits(g2, 'pl')}, {spoken_digits(g3, 'pl')}" + en_spoken = f"plus forty-eight, {spoken_digits(g1, 'en')}, {spoken_digits(g2, 'en')}, {spoken_digits(g3, 'en')}" + + examples.append(make( + f"[pl] Zadzwoń pod {phone_pl}.", + f"Zadzwoń pod {pl_spoken}." + )) + examples.append(make( + f"[en] Call {phone_pl}.", + f"Call {en_spoken}." + )) + + # US phone: (XXX) XXX-XXXX + a = random.randint(200, 999) + b = random.randint(100, 999) + c = random.randint(1000, 9999) + phone_us = f"({a}) {b}-{c}" + en_us = f"{spoken_digits(a, 'en')}, {spoken_digits(b, 'en')}, {spoken_digits(c, 'en')}" + + examples.append(make( + f"[en] Reach us at {phone_us}.", + f"Reach us at {en_us}." 
+ )) + + return examples + + +def gen_versions_ips(count=40): + """Version numbers and IP addresses.""" + examples = [] + for _ in range(count): + # Version: v3.2.1 + major = random.randint(0, 20) + minor = random.randint(0, 30) + patch = random.randint(0, 99) + ver = f"v{major}.{minor}.{patch}" + en_ver = f"version {n2w(major, 'en')} point {n2w(minor, 'en')} point {n2w(patch, 'en')}" + pl_ver = f"wersja {n2w(major, 'pl')} kropka {n2w(minor, 'pl')} kropka {n2w(patch, 'pl')}" + + examples.append(make( + f"[en] Upgrade to {ver}.", + f"Upgrade to {en_ver}." + )) + examples.append(make( + f"[pl] Zaktualizuj do {ver}.", + f"Zaktualizuj do {pl_ver}." + )) + + # IP: 192.168.1.100 + octets = [random.randint(0, 255) for _ in range(4)] + ip = '.'.join(str(o) for o in octets) + en_ip = ' dot '.join(n2w(o, 'en') for o in octets) + pl_ip = ' kropka '.join(n2w(o, 'pl') for o in octets) + + examples.append(make( + f"[en] Connect to {ip}.", + f"Connect to {en_ip}." + )) + examples.append(make( + f"[pl] Połącz się z {ip}.", + f"Połącz się z {pl_ip}." 
+ )) + + return examples + + +def gen_passthrough(count=80): + """Clean text that should pass through unchanged — negative examples.""" + clean_en = [ + "Hello, how are you doing today?", + "The quick brown fox jumps over the lazy dog.", + "Please close the door when you leave.", + "She walked slowly through the garden.", + "The meeting went well yesterday.", + "I think we should reconsider the approach.", + "Thank you for your help with this project.", + "The weather is beautiful this morning.", + "Let me know if you need anything else.", + "We appreciate your continued support.", + "The results exceeded our expectations.", + "Can you send me the updated report?", + "The new design looks great.", + "I'll follow up with you next week.", + "The system is running smoothly now.", + ] + clean_pl = [ + "Cześć, jak się masz?", + "Szybki brązowy lis przeskakuje nad leniwym psem.", + "Proszę zamknąć drzwi, gdy wychodzisz.", + "Szła powoli przez ogród.", + "Spotkanie przebiegło dobrze wczoraj.", + "Myślę, że powinniśmy ponownie rozważyć podejście.", + "Dziękuję za pomoc przy tym projekcie.", + "Pogoda jest piękna tego ranka.", + "Daj mi znać, jeśli potrzebujesz czegoś jeszcze.", + "Doceniamy wasze ciągłe wsparcie.", + "Wyniki przekroczyły nasze oczekiwania.", + "Czy możesz wysłać mi zaktualizowany raport?", + "Nowy projekt wygląda świetnie.", + "Skontaktuję się z tobą w przyszłym tygodniu.", + "System działa teraz płynnie.", + ] + + examples = [] + for _ in range(count): + text = random.choice(clean_en) + examples.append(make(f"[en] {text}", text)) + text = random.choice(clean_pl) + examples.append(make(f"[pl] {text}", text)) + + return examples + + +def gen_mixed(count=150): + """Complex mixed examples with multiple normalization needs.""" + examples = [] + + mixed_en = [ + ( + "[en] The CPU usage hit 95% at 14:30, costing us $4,500 in SLA penalties.", + "The C P U usage hit ninety-five percent at fourteen thirty, costing us four thousand five hundred dollars in S L A 
def gen_mixed(count=150):
    """Build examples that need several kinds of normalization at once.

    The hand-written English and Polish pairs are always emitted first
    (English, then Polish). The remainder, up to ``count`` in total, is
    filled with templated "acronym + count + currency amount" sentences whose
    language is picked at random per example.

    Args:
        count: Target number of examples. When it is smaller than the number
            of hand-written pairs, only those pairs are returned.

    Returns:
        List of examples as produced by ``make``.
    """
    mixed_en = [
        (
            "[en] The CPU usage hit 95% at 14:30, costing us $4,500 in SLA penalties.",
            "The C P U usage hit ninety-five percent at fourteen thirty, costing us four thousand five hundred dollars in S L A penalties."
        ),
        (
            "[en] **WARNING**: Server at 192.168.1.42 returned HTTP 503 error.",
            "WARNING: Server at one hundred ninety-two dot one hundred sixty-eight dot one dot forty-two returned H T T P five hundred three error."
        ),
        (
            "[en] Send the PDF to john@example.com by 05/30/2026.",
            "Send the P D F to john at example dot com by May thirtieth, twenty twenty-six."
        ),
        (
            "[en] The NASA rover traveled 3.7km at 0.5km/h on Mars.",
            "The NASA rover traveled three point seven kilometers at zero point five kilometers per hour on Mars."
        ),
        (
            "[en] Update to v2.4.1 — fixes #347 & improves GPU performance by 12%.",
            "Update to version two point four point one — fixes number three hundred forty-seven and improves G P U performance by twelve percent."
        ),
        (
            "[en] The FBI & CIA issued a joint FAQ about the VPN breach affecting 10,000+ users.",
            "The F B I and C I A issued a joint F A Q about the V P N breach affecting ten thousand plus users."
        ),
        (
            "[en] Meeting at 3:00 PM in room 401. Budget: €250,000. Contact HR ASAP.",
            "Meeting at three PM in room four hundred one. Budget: two hundred fifty thousand euros. Contact H R A S A P."
        ),
        (
            "[en] The LED display shows -5°C. The ATM is 200m away on 3rd street.",
            "The L E D display shows minus five degrees Celsius. The A T M is two hundred meters away on third street."
        ),
        (
            "[en] Check https://docs.python.org/api/v3 for the SDK documentation.",
            "Check docs dot python dot org slash api slash v three for the S D K documentation."
        ),
        (
            "[en] Flight BA247 departs at 08:15 from gate 12B. ETA: 11:30.",
            "Flight B A two forty-seven departs at eight fifteen from gate twelve B. E T A: eleven thirty."
        ),
        (
            "[en] The IoT device uses 2.4GHz WiFi & draws 5W at 12V DC.",
            "The I o T device uses two point four gigahertz WiFi and draws five watts at twelve volts D C."
        ),
        (
            "[en] She recieved the USB drive with 256GB of ML training data.",
            "She received the U S B drive with two hundred fifty-six gigabytes of M L training data."
        ),
        (
            "[en] Callsign: Tango Alpha Foxtrot. Coordinates: 52°N, 21°E.",
            "Callsign: T A F. Coordinates: fifty-two degrees north, twenty-one degrees east."
        ),
    ]

    mixed_pl = [
        (
            "[pl] Użycie CPU osiągnęło 95% o 14:30, kosztując nas $4,500 kar SLA.",
            "Użycie C P U osiągnęło dziewięćdziesiąt pięć procent o czternasta trzydzieści, kosztując nas cztery tysiące pięćset dolarów kar S L A."
        ),
        (
            "[pl] **UWAGA**: Serwer 192.168.1.42 zwrócił błąd HTTP 503.",
            "UWAGA: Serwer sto dziewięćdziesiąt dwa kropka sto sześćdziesiąt osiem kropka jeden kropka czterdzieści dwa zwrócił błąd H T T P pięćset trzy."
        ),
        (
            "[pl] Wyślij PDF na kontakt@firma.pl do 30.05.2026.",
            "Wyślij P D F na kontakt małpa firma dot P L do trzydziestego maja dwa tysiące dwudziestego szóstego."
        ),
        (
            "[pl] Łazik NASA przejechał 3,7km z prędkością 0,5km/h na Marsie.",
            "Łazik NASA przejechał trzy przecinek siedem kilometrów z prędkością zero przecinek pięć kilometrów na godzinę na Marsie."
        ),
        (
            "[pl] Aktualizacja do v2.4.1 — naprawia #347 & poprawia wydajność GPU o 12%.",
            "Aktualizacja do wersja dwa kropka cztery kropka jeden — naprawia numer trzysta czterdzieści siedem i poprawia wydajność G P U o dwanaście procent."
        ),
        (
            "[pl] ABW i CBA wydały wspólne FAQ o naruszeniu VPN dotyczącym 10000+ użytkowników.",
            "A B W i C B A wydały wspólne F A Q o naruszeniu V P N dotyczącym dziesięć tysięcy plus użytkowników."
        ),
        (
            "[pl] Spotkanie o 15:00 w pokoju 401. Budżet: €250000. Skontaktuj się z HR jak najszybciej.",
            "Spotkanie o piętnaście zero zero w pokoju czterysta jeden. Budżet: dwieście pięćdziesiąt tysięcy euro. Skontaktuj się z H R jak najszybciej."
        ),
        (
            "[pl] Wyświetlacz LED pokazuje -5°C. Bankomat jest 200m dalej na 3. ulicy.",
            "Wyświetlacz L E D pokazuje minus pięć stopni Celsjusza. Bankomat jest dwieście metrów dalej na trzeciej ulicy."
        ),
        (
            "[pl] Sprawdź https://docs.python.org/api/v3 po dokumentację SDK.",
            "Sprawdź docs dot python dot org slash api slash v trzy po dokumentację S D K."
        ),
        (
            "[pl] Lot BA247 odlatuje o 08:15 z bramki 12B. Przewidywany czas: 11:30.",
            "Lot B A dwieście czterdzieści siedem odlatuje o osiem piętnaście z bramki dwanaście B. Przewidywany czas: jedenaście trzydzieści."
        ),
        (
            "[pl] Urządzenie IoT używa WiFi 2,4GHz i pobiera 5W przy 12V DC.",
            "Urządzenie I o T używa WiFi dwa przecinek cztery gigaherców i pobiera pięć watów przy dwanaście woltów D C."
        ),
        (
            "[pl] Kupiła pendrive USB z 256GB danych treningowych ML za 149 zł.",
            "Kupiła pendrive U S B z dwieście pięćdziesiąt sześć gigabajtów danych treningowych M L za sto czterdzieści dziewięć złotych."
        ),
        (
            "[pl] Znak wywoławczy: Tango Alfa Foxtrot. Współrzędne: 52°N, 21°E.",
            "Znak wywoławczy: T A F. Współrzędne: pięćdziesiąt dwa stopnie północ, dwadzieścia jeden stopni wschód."
        ),
    ]

    # Hand-written pairs first: all English ones, then all Polish ones.
    examples = [make(raw, spoken) for raw, spoken in mixed_en + mixed_pl]

    # Fill the remainder with templated sentences. The random draws happen in
    # a fixed order (n, acronym, symbol, amount, language coin-flip) so that a
    # seeded run is reproducible.
    acronyms = list(SPELLED_ACRONYMS)
    remaining = count - len(mixed_en) - len(mixed_pl)
    for _ in range(remaining):
        n = random.randint(100, 9999)
        acr = random.choice(acronyms)
        acr_sp = SPELLED_ACRONYMS[acr]
        sym = random.choice(['$', '€', 'zł', '£'])
        amount = random.randint(10, 99999)

        # 'zł' is written after the amount; the other symbols are prefixes.
        if sym == 'zł':
            cur_raw = f"{amount:,} {sym}"
        else:
            cur_raw = f"{sym}{amount:,}"

        if random.random() < 0.5:
            example = make(
                f"[en] The {acr} report shows {n} entries totaling {cur_raw}.",
                f"The {acr_sp} report shows {n2w(n, 'en')} entries totaling {fmt_currency(amount, sym, 'en')}."
            )
        else:
            example = make(
                f"[pl] Raport {acr} pokazuje {n} wpisów na łącznie {cur_raw}.",
                f"Raport {acr_sp} pokazuje {n2w(n, 'pl')} wpisów na łącznie {fmt_currency(amount, sym, 'pl')}."
            )
        examples.append(example)

    return examples
def gen_abbreviations(count=40):
    """Generate abbreviation-expansion examples in English and Polish.

    Every abbreviation carries its own carrier sentence so that it appears in
    a position where it actually occurs in real text. Previously all
    abbreviations were placed in front of a surname, which produced nonsense
    pairs such as "etc. Smith will arrive at 5." -> "et cetera Smith ..." or
    "ul. Kowalski przyjedzie o 5.". Title abbreviations (Mr., Dr., dr, prof.,
    ...) keep the original surname sentence.

    Each template has two slots: ``{abbr}`` (abbreviation in the input,
    expansion in the target) and ``{n}`` (the digit "5" in the input, its
    spoken form in the target). All Polish templates use the hour as "o {n}",
    so the spoken form is the locative "piątej" ("o piąta" is ungrammatical).

    Args:
        count: Number of iterations; each one adds one English and one Polish
            example, so the result holds ``2 * count`` examples.

    Returns:
        List of examples as produced by ``make``.
    """
    title_en = "{abbr} Smith will arrive at {n}."
    name_suffix_en = "John Smith {abbr} will arrive at {n}."
    abbrevs_en = {
        'Mr.': ('Mister', title_en),
        'Mrs.': ('Misses', title_en),
        'Dr.': ('Doctor', title_en),
        'St.': ('Saint', "The {abbr} Patrick's parade starts at {n}."),
        'Ave.': ('Avenue', "Meet me on Fifth {abbr} at {n}."),
        'Blvd.': ('Boulevard', "The shop on Sunset {abbr} opens at {n}."),
        'Jr.': ('Junior', name_suffix_en),
        'Sr.': ('Senior', name_suffix_en),
        'Prof.': ('Professor', title_en),
        'vs.': ('versus', "The Lakers {abbr} Celtics game starts at {n}."),
        'etc.': ('et cetera', "Bring pens, paper, {abbr} by {n}."),
        'e.g.': ('for example', "Pick a time, {abbr} around {n}."),
        'i.e.': ('that is', "Come early, {abbr} before {n}."),
        'approx.': ('approximately', "The trip takes {abbr} {n} hours."),
        'dept.': ('department', "The sales {abbr} closes at {n}."),
        'govt.': ('government', "The {abbr} office opens at {n}."),
        'inc.': ('incorporated', "Acme {abbr} opens at {n}."),
        'corp.': ('corporation', "Acme {abbr} closes at {n}."),
    }

    title_pl = "{abbr} Kowalski przyjedzie o {n}."
    # Address templates keep the expansion in the nominative case.
    abbrevs_pl = {
        'dr': ('doktor', title_pl),
        'prof.': ('profesor', title_pl),
        'mgr': ('magister', title_pl),
        'inż.': ('inżynier', title_pl),
        'ul.': ('ulica', "Spotkajmy się o {n}, adres: {abbr} Długa."),
        'al.': ('aleja', "Spotkajmy się o {n}, adres: {abbr} Niepodległości."),
        'pl.': ('plac', "Spotkajmy się o {n}, adres: {abbr} Zbawiciela."),
        'os.': ('osiedle', "Spotkajmy się o {n}, adres: {abbr} Słoneczne."),
        'nr': ('numer', "Sala {abbr} B będzie wolna o {n}."),
        'tel.': ('telefon', "Nasz {abbr} kontaktowy będzie czynny o {n}."),
        'godz.': ('godzina', "Zmieniła się {abbr} spotkania, zaczynamy o {n}."),
        'ok.': ('około', "Przyjdzie {abbr} dziesięciu osób, zaczynamy o {n}."),
        'np.': ('na przykład', "Możemy się spotkać {abbr} jutro o {n}."),
        'tj.': ('to jest', "Zaczynamy po południu, {abbr} o {n}."),
        'itd.': ('i tak dalej', "Weź dokumenty, długopis {abbr}, zaczynamy o {n}."),
        'itp.': ('i tym podobne', "Przynieś kanapki, napoje {abbr}, zaczynamy o {n}."),
        'wg': ('według', "Pociąg {abbr} rozkładu przyjedzie o {n}."),
        'dot.': ('dotyczący', "Komunikat {abbr} spotkania ukaże się o {n}."),
    }

    # Materialize once; random.choice needs a sequence.
    en_items = list(abbrevs_en.items())
    pl_items = list(abbrevs_pl.items())

    examples = []
    for _ in range(count):
        abbr, (full, tpl) = random.choice(en_items)
        examples.append(make(
            "[en] " + tpl.format(abbr=abbr, n="5"),
            tpl.format(abbr=full, n="five"),
        ))

        abbr, (full, tpl) = random.choice(pl_items)
        examples.append(make(
            "[pl] " + tpl.format(abbr=abbr, n="5"),
            tpl.format(abbr=full, n="piątej"),
        ))

    return examples
def gen_contractions(count=40):
    """Generate English contraction-expansion examples.

    Contractions are grouped by the grammatical slot they fill, and each
    group has templates that are grammatical for every member. Previously any
    contraction could land in any template, producing inputs such as
    "I think don't a good idea." or "They said let's ready yet.". Grouping
    also pins the reading of ambiguous forms: "he's" only appears where
    "he is" is the correct expansion.

    When the slot opens the sentence, the first letter of both the
    contraction and its expansion is upper-cased ("it's ..." -> "It's ...").

    Args:
        count: Number of examples to generate.

    Returns:
        List of examples as produced by ``make``.
    """
    groups = [
        # subject + "be"
        (
            ["{} going to work out.", "I think {} ready now.",
             "They said {} almost done."],
            {"I'm": "I am", "you're": "you are", "he's": "he is",
             "she's": "she is", "it's": "it is", "we're": "we are",
             "they're": "they are", "that's": "that is"},
        ),
        # existential / presentational
        (
            ["{} a better way to do this.", "{} the report you asked for."],
            {"there's": "there is", "here's": "here is"},
        ),
        # interrogatives
        (
            ["{} next on the list?", "{} behind the delay?"],
            {"who's": "who is", "what's": "what is"},
        ),
        # subject + "have"
        (
            ["{} already finished the report.",
             "I think {} made the right call."],
            {"I've": "I have", "you've": "you have", "we've": "we have",
             "they've": "they have"},
        ),
        # subject + "will"
        (
            ["{} send the report tomorrow.", "They said {} need more time."],
            {"I'll": "I will", "you'll": "you will", "we'll": "we will",
             "they'll": "they will"},
        ),
        (
            ["{} like to see the report.", "I think {} prefer the first option."],
            {"I'd": "I would"},
        ),
        (
            ["{} review the report together."],
            {"let's": "let us"},
        ),
        # negated auxiliaries followed by a bare verb (plural subject)
        (
            ["They {} finish the work today.", "We {} change the plan now."],
            {"don't": "do not", "didn't": "did not", "won't": "will not",
             "wouldn't": "would not", "couldn't": "could not",
             "shouldn't": "should not", "can't": "cannot"},
        ),
        # third-person singular needs its own subject
        (
            ["It {} change the plan.", "She {} work here anymore."],
            {"doesn't": "does not"},
        ),
        # negated "be", singular / plural
        (
            ["It {} ready yet.", "They said it {} the right approach."],
            {"isn't": "is not", "wasn't": "was not"},
        ),
        (
            ["They {} ready yet.", "We {} what they expected."],
            {"aren't": "are not", "weren't": "were not"},
        ),
        # negated "have", singular / plural
        (
            ["She {} finished yet.", "It {} started yet."],
            {"hasn't": "has not"},
        ),
        (
            ["They {} finished yet.", "We {} seen the report."],
            {"haven't": "have not", "hadn't": "had not"},
        ),
    ]

    # Flatten so each contraction is equally likely, as in a flat table.
    entries = [
        (contr, expanded, templates)
        for templates, mapping in groups
        for contr, expanded in mapping.items()
    ]

    def fill(template, text):
        # Upper-case only the first character when the slot starts the sentence.
        if template.startswith("{}"):
            text = text[0].upper() + text[1:]
        return template.format(text)

    examples = []
    for _ in range(count):
        contr, expanded, templates = random.choice(entries)
        tpl = random.choice(templates)
        examples.append(make(
            f"[en] {fill(tpl, contr)}",
            fill(tpl, expanded)
        ))

    return examples
def _pl_magnitude_word(whole, has_fraction, forms):
    """Return the Polish magnitude word that agrees with the numeral.

    ``forms`` is ``(singular, paucal, genitive_plural, genitive_singular)``,
    e.g. ``('milion', 'miliony', 'milionów', 'miliona')``.
    """
    singular, paucal, plural, fractional = forms
    if has_fraction:
        # Decimal quantities take the genitive singular: "2,5 miliona".
        return fractional
    if whole == 1:
        return singular
    # 2-4, 22-24, ... take the paucal form; 12-14 are the exception.
    if whole % 10 in (2, 3, 4) and whole % 100 not in (12, 13, 14):
        return paucal
    return plural


def gen_large_numbers_shorthand(count=40):
    """Generate currency examples with K/M/B shorthands (4.5M, 2B, 12K, ...).

    Each iteration draws one shorthand and emits it with three currency
    symbols in both languages, i.e. six examples per iteration.

    The Polish magnitude word is inflected to agree with the numeral
    ("jeden milion", "dwa miliony", "pięć milionów", "dwa przecinek pięć
    miliona"); previously the genitive plural was used unconditionally,
    yielding targets such as "jeden milionów".

    Args:
        count: Number of iterations; the result holds ``6 * count`` examples.

    Returns:
        List of examples as produced by ``make``.
    """
    suffixes = {
        'K': ('thousand', ('tysiąc', 'tysiące', 'tysięcy', 'tysiąca')),
        'M': ('million', ('milion', 'miliony', 'milionów', 'miliona')),
        'B': ('billion', ('miliard', 'miliardy', 'miliardów', 'miliarda')),
    }

    examples = []
    for _ in range(count):
        suffix, (en_word, pl_forms) = random.choice(list(suffixes.items()))
        whole = random.randint(1, 99)
        # Either no fractional digit or a single non-zero one.
        frac = random.choice([0, random.randint(1, 9)])
        pl_word = _pl_magnitude_word(whole, bool(frac), pl_forms)

        if frac:
            raw = f"{whole}.{frac}{suffix}"
            en_spoken = f"{n2w(whole, 'en')} point {n2w(frac, 'en')} {en_word}"
            pl_spoken = f"{n2w(whole, 'pl')} przecinek {n2w(frac, 'pl')} {pl_word}"
        else:
            raw = f"{whole}{suffix}"
            en_spoken = f"{n2w(whole, 'en')} {en_word}"
            pl_spoken = f"{n2w(whole, 'pl')} {pl_word}"

        for sym in ['$', '€', 'zł']:
            # '$' and '€' are prefixes; 'zł' follows the amount.
            if sym in {'$', '€'}:
                cur_raw = f"{sym}{raw}"
            else:
                cur_raw = f"{raw} {sym}"

            # NOTE(review): the constants 2 and 5 presumably select the plural
            # currency form in each language - confirm against the helpers.
            examples.append(make(
                f"[en] Revenue reached {cur_raw}.",
                f"Revenue reached {en_spoken} {en_currency_name(sym, 2)}."
            ))
            examples.append(make(
                f"[pl] Przychody osiągnęły {cur_raw}.",
                f"Przychody osiągnęły {pl_spoken} {pl_currency_name(sym, 5)}."
            ))

    return examples
def main():
    """Run every generator, shuffle the combined examples, write the JSONL.

    The RNG is seeded once with ``SEED`` before any generator runs, and the
    generators are invoked in the order of the table below, so the output is
    reproducible as long as that order is kept.
    """
    random.seed(SEED)

    # (label, generator, count) - order matters for reproducibility.
    plan = (
        ("numbers", gen_numbers, 200),
        ("negatives_decimals", gen_negatives_decimals, 80),
        ("ordinals", gen_ordinals, 80),
        ("percentages", gen_percentages, 60),
        ("dates", gen_dates, 100),
        ("times", gen_times, 80),
        ("currency", gen_currency, 120),
        ("units", gen_units, 80),
        ("temperatures", gen_temperatures, 40),
        ("acronyms", gen_acronyms, 150),
        ("nato", gen_nato, 100),
        ("spelling", gen_spelling_corrections, 80),
        ("urls_emails", gen_urls_emails, 80),
        ("symbols", gen_symbols, 60),
        ("markdown", gen_markdown, 60),
        ("phone_numbers", gen_phone_numbers, 40),
        ("versions_ips", gen_versions_ips, 40),
        ("passthrough", gen_passthrough, 80),
        ("mixed", gen_mixed, 150),
        ("abbreviations", gen_abbreviations, 40),
        ("contractions", gen_contractions, 40),
        ("large_numbers", gen_large_numbers_shorthand, 40),
    )

    all_examples = []
    for label, generate, size in plan:
        batch = generate(size)
        print(f" {label}: {len(batch)} examples")
        all_examples += batch

    random.shuffle(all_examples)

    # One JSON object per line; keep non-ASCII characters unescaped.
    with OUTPUT.open("w", encoding="utf-8") as out:
        out.writelines(
            json.dumps(example, ensure_ascii=False) + "\n"
            for example in all_examples
        )

    print(f"\nTotal: {len(all_examples)} examples -> {OUTPUT}")
print("Applying LoRA...")
# Wrap the loaded base model with LoRA adapters on the seven projection
# modules named below; rank/alpha come from the CONFIG constants.
model = FastLanguageModel.get_peft_model(
    model,
    r=LORA_RANK,
    lora_alpha=LORA_ALPHA,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=SEED,
)

# ──────────────────────────────────────────────────────────────
# DATASET
# ──────────────────────────────────────────────────────────────

print(f"Loading dataset from {DATASET_PATH}...")
# The whole JSONL file is loaded as a single "train" split.
dataset = load_dataset("json", data_files=DATASET_PATH, split="train")
print(f" {len(dataset)} examples loaded")

# Standardize to ShareGPT format (handles from/value vs role/content)
dataset = standardize_sharegpt(dataset)

# Pre-apply chat template via map — avoids formatting_func signature issues
def apply_template(examples):
    """Render a batch of conversations to plain training text.

    Args:
        examples: Batch mapping passed by ``dataset.map(..., batched=True)``;
            its "conversations" entry is iterated one conversation at a time.

    Returns:
        Dict with a single "text" key: one rendered string per conversation,
        in input order.
    """
    convos = examples["conversations"]
    texts = []
    for convo in convos:
        # tokenize=False returns a string; no generation prompt is appended.
        text = tokenizer.apply_chat_template(
            convo,
            tokenize=False,
            add_generation_prompt=False,
        )
        texts.append(text)
    return {"text": texts}

print("Applying chat template...")
# Adds the "text" column that the trainer reads via dataset_text_field.
dataset = dataset.map(apply_template, batched=True, num_proc=2)

# ──────────────────────────────────────────────────────────────
# TRAINER
# ──────────────────────────────────────────────────────────────

print("Setting up trainer...")
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    args=TrainingArguments(
        output_dir=OUTPUT_DIR,
        per_device_train_batch_size=BATCH_SIZE,
        gradient_accumulation_steps=GRAD_ACCUM,
        num_train_epochs=EPOCHS,
        learning_rate=LR,
        lr_scheduler_type="cosine",
        warmup_steps=WARMUP_STEPS,
        # Exactly one of fp16/bf16 is enabled, depending on bf16 support.
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        logging_steps=LOGGING_STEPS,
        save_steps=SAVE_STEPS,
        save_total_limit=3,
        seed=SEED,
        optim="adamw_8bit",
        weight_decay=0.01,
        max_grad_norm=1.0,
        report_to="none",
        dataloader_num_workers=2,
    ),
    max_seq_length=MAX_SEQ_LEN,
    dataset_num_proc=2,
    packing=True,  # pack short examples for efficiency
)

# ──────────────────────────────────────────────────────────────
# TRAIN
# ──────────────────────────────────────────────────────────────

print("Starting training...")
stats = trainer.train()
# Summarize the run from the object returned by trainer.train().
print(f"\nTraining complete!")
print(f" Total steps: {stats.global_step}")
print(f" Train loss: {stats.training_loss:.4f}")
print(f" Runtime: {stats.metrics['train_runtime']:.0f}s")

# ──────────────────────────────────────────────────────────────
# SAVE ADAPTER
# ──────────────────────────────────────────────────────────────

print(f"\nSaving adapter to {OUTPUT_DIR}...")
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

# Verify
# Report-only check: a missing adapter file prints a warning but does not
# raise or change the exit status.
adapter_path = Path(OUTPUT_DIR) / "adapter_model.safetensors"
if adapter_path.exists():
    size_mb = adapter_path.stat().st_size / (1024 * 1024)
    print(f" Adapter saved: {size_mb:.1f} MB")
else:
    print(" WARNING: adapter_model.safetensors not found!")

config_path = Path(OUTPUT_DIR) / "adapter_config.json"
if config_path.exists():
    print(f" Config saved: {config_path}")

# Print a ready-to-copy serve command; the adapter path is made absolute.
print(f"\nDone. Serve with:")
print(f" vllm serve Qwen/Qwen2.5-7B-Instruct \\")
print(f" --enable-lora \\")
print(f" --lora-modules tts-norm={os.path.abspath(OUTPUT_DIR)} \\")
print(f" --max-lora-rank {LORA_RANK}")