diff --git a/gen_tts_dataset.py b/gen_tts_dataset.py
new file mode 100644
index 0000000..3fa1632
--- /dev/null
+++ b/gen_tts_dataset.py
@@ -0,0 +1,1352 @@
+#!/usr/bin/env python3
+"""Generate TTS normalization LoRA training dataset.
+
+Language is EXPLICITLY tagged [en] or [pl] by the TTS client — no detection/inference.
+Currency, units, and content are language-independent (Polish text can have dollars,
+English text can have złoty).
+
+Output: JSONL in ShareGPT conversation format for Unsloth/Qwen2.5-Instruct.
+"""
+
+import json
+import random
+import re
+from pathlib import Path
+from num2words import num2words
+
+# Seed value for random-number generation (the seeding call itself is not in
+# this part of the file).
+SEED = 42
+# Path of the JSONL dataset file; presumably written by code outside this
+# excerpt — TODO confirm.
+OUTPUT = Path("tts_norm_dataset.jsonl")
+
+# System prompt that make() places first in every generated conversation.
+SYSTEM = (
+    "You are a TTS text preprocessor. The input begins with a language tag [en] or [pl] "
+    "that specifies the target speech language. Normalize the text for natural speech synthesis: "
+    "expand numbers to words in the tagged language, handle acronyms (spell out letter-acronyms "
+    "with spaces between letters, keep pronounceable acronyms intact), fix spelling errors, "
+    "convert symbols and URLs to spoken form, and strip markdown formatting. "
+    "Output only the normalized text without the language tag."
+)
+
+# ──────────────────────────────────────────────────────────────
+# NATO PHONETIC ALPHABET
+# ──────────────────────────────────────────────────────────────
+# Maps each uppercase letter A-Z to its code word. gen_nato() uses the table
+# to build the code-word strings and the keys as the letter pool. Note the
+# spellings 'Alfa' and 'Juliett': these exact strings appear in the examples.
+NATO = {
+    'A': 'Alfa', 'B': 'Bravo', 'C': 'Charlie', 'D': 'Delta',
+    'E': 'Echo', 'F': 'Foxtrot', 'G': 'Golf', 'H': 'Hotel',
+    'I': 'India', 'J': 'Juliett', 'K': 'Kilo', 'L': 'Lima',
+    'M': 'Mike', 'N': 'November', 'O': 'Oscar', 'P': 'Papa',
+    'Q': 'Quebec', 'R': 'Romeo', 'S': 'Sierra', 'T': 'Tango',
+    'U': 'Uniform', 'V': 'Victor', 'W': 'Whiskey', 'X': 'X-ray',
+    'Y': 'Yankee', 'Z': 'Zulu',
+}
+
+# ──────────────────────────────────────────────────────────────
+# ACRONYMS — spelled letter-by-letter
+# ──────────────────────────────────────────────────────────────
+# Maps an acronym to its spoken form: the same letters separated by single
+# spaces. gen_acronyms() samples (acronym, spelling) pairs from this table
+# and uses the same spelling for both [en] and [pl] examples.
+# NOTE(review): several entries (e.g. RAM, JSON, PESEL, ZUS, VAT) are commonly
+# pronounced as words — confirm they belong here rather than in WORD_ACRONYMS.
+SPELLED_ACRONYMS = {
+    'API': 'A P I', 'CPU': 'C P U', 'GPU': 'G P U', 'RAM': 'R A M',
+    'SSD': 'S S D', 'HTTP': 'H T T P', 'HTTPS': 'H T T P S',
+    'URL': 'U R L', 'HTML': 'H T M L', 'CSS': 'C S S',
+    'FBI': 'F B I', 'CIA': 'C I A', 'NFL': 'N F L', 'NBA': 'N B A',
+    'MVP': 'M V P', 'CEO': 'C E O', 'CFO': 'C F O', 'CTO': 'C T O',
+    'AI': 'A I', 'ML': 'M L', 'OS': 'O S', 'UI': 'U I', 'UX': 'U X',
+    'IP': 'I P', 'VPN': 'V P N', 'DNS': 'D N S', 'SSH': 'S S H',
+    'FTP': 'F T P', 'SQL': 'S Q L', 'XML': 'X M L', 'JSON': 'J S O N',
+    'AWS': 'A W S', 'USB': 'U S B', 'HDMI': 'H D M I',
+    'LCD': 'L C D', 'LED': 'L E D', 'PDF': 'P D F', 'FAQ': 'F A Q',
+    'DIY': 'D I Y', 'ETA': 'E T A', 'FYI': 'F Y I', 'ASAP': 'A S A P',
+    'GPS': 'G P S', 'ATM': 'A T M', 'BBC': 'B B C', 'CNN': 'C N N',
+    'EU': 'E U', 'UN': 'U N', 'UK': 'U K', 'US': 'U S',
+    'RGB': 'R G B', 'TCP': 'T C P', 'UDP': 'U D P',
+    'IDE': 'I D E', 'SDK': 'S D K', 'CI': 'C I', 'CD': 'C D',
+    'PR': 'P R', 'QA': 'Q A', 'IT': 'I T', 'HR': 'H R',
+    'PKP': 'P K P', 'PZU': 'P Z U', 'ZUS': 'Z U S', 'NFZ': 'N F Z',
+    'PKO': 'P K O', 'TVP': 'T V P', 'TVN': 'T V N',
+    'KRS': 'K R S', 'NIP': 'N I P', 'VAT': 'V A T',
+    'ZTM': 'Z T M', 'MPK': 'M P K', 'GUS': 'G U S', 'PGE': 'P G E',
+    'NBP': 'N B P', 'PKB': 'P K B', 'UE': 'U E', 'ONZ': 'O N Z',
+    'RPO': 'R P O', 'NIK': 'N I K', 'SLD': 'S L D',
+    'PIS': 'P I S', 'PSL': 'P S L', 'PCK': 'P C K',
+    'AGH': 'A G H', 'UJ': 'U J', 'UW': 'U W', 'PWN': 'P W N',
+    'IPN': 'I P N', 'ABW': 'A B W', 'CBA': 'C B A',
+    'PIT': 'P I T', 'CIT': 'C I T', 'PESEL': 'P E S E L',
+    'TTS': 'T T S', 'NPC': 'N P C', 'RPG': 'R P G', 'PVP': 'P V P',
+    'DPS': 'D P S', 'MMO': 'M M O', 'RNG': 'R N G', 'AFK': 'A F K',
+    'LLM': 'L L M', 'NLP': 'N L P', 'OCR': 'O C R', 'ORM': 'O R M',
+}
+
+# Acronyms pronounced as words (not spelled).
+# gen_acronyms() emits these unchanged: the expected output of each example
+# is identical to its input sentence.
+WORD_ACRONYMS = [
+    'NASA', 'NATO', 'LASER', 'RADAR', 'SCUBA', 'PIN', 'SIM', 'BIOS',
+    'CAPTCHA', 'AWOL', 'GIF', 'JPEG', 'GOPR', 'IKEA', 'FIAT',
+    'UNICEF', 'UNESCO', 'AIDS', 'COVID', 'BASIC', 'SWIFT', 'DART',
+    'RUST', 'AJAX', 'LIDAR', 'MODEM', 'PIXEL',
+]
+
+# Words that LOOK like acronyms (all-caps) but are just regular words.
+# gen_acronyms() draws from this list with random.choice(), so each word is
+# listed exactly once to keep the draw uniform.
+NOT_ACRONYMS = [
+    'NVIDIA', 'HELLO', 'STOP', 'WARNING', 'ERROR', 'DANGER',
+    'IMPORTANT', 'NOTE', 'URGENT', 'ATTENTION', 'WELCOME', 'EXIT',
+    'OPEN', 'CLOSE', 'START', 'FINISH', 'UWAGA', 'WEJŚCIE',
+    'WYJŚCIE', 'ZAMKNIĘTE', 'OTWARTE',
+]
+
+# ──────────────────────────────────────────────────────────────
+# SPELLING ERRORS
+# ──────────────────────────────────────────────────────────────
+# Maps a misspelled English word to its correction. gen_spelling_corrections()
+# keeps only the pairs whose key differs from its value.
+# NOTE(review): 'judgement' is an accepted spelling in some varieties of
+# English — confirm it should be treated as an error.
+TYPOS_EN = {
+    'recieve': 'receive', 'definately': 'definitely', 'occured': 'occurred',
+    'seperate': 'separate', 'accomodate': 'accommodate', 'neccessary': 'necessary',
+    'wierd': 'weird', 'occassion': 'occasion', 'concious': 'conscious',
+    'enviroment': 'environment', 'goverment': 'government', 'independant': 'independent',
+    'knowlege': 'knowledge', 'langauge': 'language', 'maintainance': 'maintenance',
+    'millenium': 'millennium', 'noticable': 'noticeable', 'persistant': 'persistent',
+    'publically': 'publicly', 'recomend': 'recommend', 'refering': 'referring',
+    'successfull': 'successful', 'suprise': 'surprise', 'tommorow': 'tomorrow',
+    'untill': 'until', 'wether': 'whether', 'wich': 'which',
+    'thier': 'their', 'teh': 'the', 'adn': 'and', 'hte': 'the',
+    'becuase': 'because', 'beleive': 'believe', 'calender': 'calendar',
+    'collegue': 'colleague', 'comittee': 'committee', 'dissapoint': 'disappoint',
+    'embarass': 'embarrass', 'existance': 'existence', 'foriegn': 'foreign',
+    'gurantee': 'guarantee', 'harrass': 'harass', 'imediately': 'immediately',
+    'jewlery': 'jewelry', 'judgement': 'judgment', 'liason': 'liaison',
+}
+
+# Maps a Polish word to its corrected spelling. Entries whose key equals the
+# value are correctly spelled words; gen_spelling_corrections() filters them
+# out and only emits pairs that actually differ.
+# Two earlier pairs were dropped because they are not spelling corrections:
+# one replaced a real word with a different word, the other mapped a
+# misspelling onto a target that is not a Polish word either.
+TYPOS_PL = {
+    'rząd': 'rząd',  # correct, no-op example
+    'wziąść': 'wziąć', 'włanczyć': 'włączyć', 'poszłem': 'poszedłem',
+    'pokarze': 'pokaże', 'napewno': 'na pewno', 'wogóle': 'w ogóle',
+    'niewiem': 'nie wiem', 'przedewszystkim': 'przede wszystkim',
+    'conajmniej': 'co najmniej', 'natomist': 'natomiast',
+    'ponieważ': 'ponieważ',  # correct
+    'żadko': 'rzadko', 'bynajmiej': 'bynajmniej',
+    'jakby': 'jakby',  # correct
+    'pomóżcie': 'pomóżcie',  # correct
+    'włożyłam': 'włożyłam',  # correct
+    'spróbój': 'spróbuj',
+    'przyjeżdzać': 'przyjeżdżać', 'żółtko': 'żółtko',  # correct
+    'gżegżółka': 'gżegżółka',  # correct — hard word
+    'sprawdźić': 'sprawdzić', 'ząmówić': 'zamówić',
+    'orginalny': 'oryginalny', 'symultaniczny': 'symultaniczny',
+    'odzwyczaić': 'odzwyczaić',  # correct
+    'czterysta': 'czterysta',  # correct
+    'rzentelny': 'rzetelny', 'wchodzić': 'wchodzić',  # correct
+}
+
+# ──────────────────────────────────────────────────────────────
+# SENTENCE TEMPLATES (with {} placeholders)
+# ──────────────────────────────────────────────────────────────
+# English sentences with one {} slot; gen_numbers() fills it with the digit
+# form for the input and the spelled-out form for the expected output.
+NUM_TEMPLATES_EN = [
+    "There are {} items in the queue.",
+    "The building has {} floors.",
+    "She scored {} points in the final round.",
+    "Approximately {} people attended the event.",
+    "{} units were shipped yesterday.",
+    "The file contains {} lines of code.",
+    "He waited {} minutes for the train.",
+    "The population reached {} last year.",
+    "We need {} more signatures.",
+    "The distance is {} meters.",
+    "Page {} of the document.",
+    "Chapter {} covers advanced topics.",
+    "Flight {} departs at noon.",
+    "Room {} is on the third floor.",
+    "The team completed {} sprints this quarter.",
+]
+
+# Polish counterparts of NUM_TEMPLATES_EN, filled the same way.
+# NOTE(review): the nouns are fixed in the genitive plural ("elementów",
+# "pięter"), which does not agree with every number gen_numbers() can draw
+# (e.g. 1 or 2-4) — confirm whether that is acceptable for the dataset.
+NUM_TEMPLATES_PL = [
+    "W kolejce jest {} elementów.",
+    "Budynek ma {} pięter.",
+    "Zdobyła {} punktów w finale.",
+    "Na wydarzeniu pojawiło się około {} osób.",
+    "Wysłano {} jednostek wczoraj.",
+    "Plik zawiera {} linii kodu.",
+    "Czekał {} minut na pociąg.",
+    "Populacja osiągnęła {} w zeszłym roku.",
+    "Potrzebujemy jeszcze {} podpisów.",
+    "Odległość wynosi {} metrów.",
+    "Strona {} dokumentu.",
+    "Rozdział {} obejmuje zaawansowane tematy.",
+    "Lot {} odlatuje w południe.",
+    "Pokój {} jest na trzecim piętrze.",
+    "Zespół ukończył {} sprintów w tym kwartale.",
+]
+
+# English sentences with one {} slot for a currency amount; used by
+# gen_currency() for both the raw and the spoken amount.
+CURRENCY_TEMPLATES_EN = [
+    "The total is {}.",
+    "She paid {}.",
+    "The budget was set at {}.",
+    "It costs {} per unit.",
+    "They raised {} for charity.",
+    "The invoice shows {}.",
+    "Repairs will cost approximately {}.",
+    "The price dropped to {}.",
+]
+
+# Polish counterparts of CURRENCY_TEMPLATES_EN.
+CURRENCY_TEMPLATES_PL = [
+    "Łącznie to {}.",
+    "Zapłaciła {}.",
+    "Budżet ustalono na {}.",
+    "Kosztuje {} za sztukę.",
+    "Zebrali {} na cele charytatywne.",
+    "Faktura pokazuje {}.",
+    "Naprawa będzie kosztować około {}.",
+    "Cena spadła do {}.",
+]
+
+# English sentences with one {} slot for an acronym; gen_acronyms() fills it
+# with the acronym for the input and its spaced spelling for the output.
+ACRONYM_TEMPLATES_EN = [
+    "The {} system is down.",
+    "Check the {} settings.",
+    "We're migrating to {}.",
+    "The {} update is ready.",
+    "{} performance improved significantly.",
+    "Connect via {} to the server.",
+    "The {} team approved the request.",
+]
+
+# Polish counterparts of ACRONYM_TEMPLATES_EN.
+ACRONYM_TEMPLATES_PL = [
+    "System {} nie działa.",
+    "Sprawdź ustawienia {}.",
+    "Migrujemy do {}.",
+    "Aktualizacja {} jest gotowa.",
+    "Wydajność {} znacząco się poprawiła.",
+    "Połącz się przez {} z serwerem.",
+    "Zespół {} zatwierdził wniosek.",
+]
+
+# ──────────────────────────────────────────────────────────────
+# HELPERS
+# ──────────────────────────────────────────────────────────────
+
+def n2w(n, lang):
+    """Spell out the number ``n`` as words in language ``lang``.
+
+    Negative ints and floats are converted by magnitude and prefixed with the
+    literal word "minus" (the original note cites a num2words problem with
+    negatives in Polish). Everything else is passed straight to num2words.
+    """
+    is_negative = isinstance(n, (int, float)) and n < 0
+    if not is_negative:
+        return num2words(n, lang=lang)
+    magnitude = num2words(abs(n), lang=lang)
+    return f"minus {magnitude}"
+
+def n2w_ordinal(n, lang):
+    """Spell out ``n`` as an ordinal word in language ``lang`` via num2words."""
+    options = {'to': 'ordinal', 'lang': lang}
+    return num2words(n, **options)
+
+def en_currency_name(symbol, n):
+    """Return the English currency word for ``symbol``, numbered to match ``n``.
+
+    The singular form is used only when the magnitude of ``n`` is exactly 1.
+    A symbol that is not in the table is returned unchanged.
+    """
+    singular_plural = {
+        '$': ('dollar', 'dollars'),
+        '€': ('euro', 'euros'),
+        'zł': ('zloty', 'zloty'),
+        '£': ('pound', 'pounds'),
+        '¥': ('yen', 'yen'),
+        'CHF': ('Swiss franc', 'Swiss francs'),
+        'kr': ('krona', 'kronor'),
+    }
+    if symbol not in singular_plural:
+        return symbol
+    singular, plural = singular_plural[symbol]
+    if abs(n) == 1:
+        return singular
+    return plural
+
+def pl_currency_name(symbol, n):
+    """Return the Polish currency word for ``symbol``, declined to match ``n``.
+
+    Three forms are stored per currency. The first is used when the magnitude
+    of ``n`` is 1. The second is used when the last digit is 2-4 and the last
+    two digits are not 12-14. The third is used otherwise. A symbol that is
+    not in the table is returned unchanged.
+    """
+    declensions = {
+        '$':  ('dolar', 'dolary', 'dolarów'),
+        '€':  ('euro', 'euro', 'euro'),
+        'zł': ('złoty', 'złote', 'złotych'),
+        '£':  ('funt', 'funty', 'funtów'),
+        '¥':  ('jen', 'jeny', 'jenów'),
+        'CHF': ('frank szwajcarski', 'franki szwajcarskie', 'franków szwajcarskich'),
+        'kr': ('korona', 'korony', 'koron'),
+    }
+    one, few, many = declensions.get(symbol, (symbol, symbol, symbol))
+
+    magnitude = abs(n)
+    if magnitude == 1:
+        return one
+
+    last_digit = magnitude % 10
+    in_teens = 12 <= magnitude % 100 <= 14
+    if 2 <= last_digit <= 4 and not in_teens:
+        return few
+    return many
+
+def fmt_currency(n, symbol, lang):
+    """Return the spoken form of ``n`` units of the currency ``symbol``.
+
+    The number is spelled out with n2w(). The currency word comes from
+    en_currency_name() when ``lang`` is 'en' and from pl_currency_name() for
+    any other value.
+    """
+    amount_words = n2w(n, lang)
+    name_for = en_currency_name if lang == 'en' else pl_currency_name
+    return f"{amount_words} {name_for(symbol, n)}"
+
+def make(raw, norm):
+    """Wrap one (input, expected output) pair as a ShareGPT conversation.
+
+    The conversation has three turns in fixed order: the SYSTEM prompt, the
+    raw text as the human turn, and the normalized text as the gpt turn.
+    """
+    turns = (("system", SYSTEM), ("human", raw), ("gpt", norm))
+    return {
+        "conversations": [
+            {"from": speaker, "value": text} for speaker, text in turns
+        ]
+    }
+
+# ──────────────────────────────────────────────────────────────
+# GENERATORS
+# ──────────────────────────────────────────────────────────────
+
+def gen_numbers(count=200):
+    """Build cardinal-number examples embedded in template sentences.
+
+    Each iteration draws one magnitude band, then one integer from it, and
+    emits an [en] example followed by a [pl] example for that same integer.
+    The result therefore holds ``2 * count`` conversations.
+    """
+    # Sampling a band first spreads values over orders of magnitude instead
+    # of letting six-digit numbers dominate.
+    magnitude_bands = [
+        (0, 20), (21, 99), (100, 999), (1000, 9999),
+        (10000, 99999), (100000, 999999),
+    ]
+    # NOTE(review): templates keep a fixed noun form, so some number/noun
+    # combinations are ungrammatical — confirm this is acceptable.
+    per_language = (('en', NUM_TEMPLATES_EN), ('pl', NUM_TEMPLATES_PL))
+
+    examples = []
+    for _ in range(count):
+        low, high = random.choice(magnitude_bands)
+        value = random.randint(low, high)
+
+        for tag, templates in per_language:
+            template = random.choice(templates)
+            raw_text = f"[{tag}] {template.format(value)}"
+            spoken_text = template.format(n2w(value, tag))
+            examples.append(make(raw_text, spoken_text))
+
+    return examples
+
+
+def gen_negatives_decimals(count=80):
+    """Build examples for negative integers and decimal fractions.
+
+    Each iteration adds four conversations: an [en]/[pl] pair about a
+    negative temperature (-500..-1) and an [en]/[pl] pair about a decimal
+    reading. Decimals are spoken as the whole part, then "point" (English)
+    or "przecinek" (Polish), then the fractional digits one at a time.
+
+    Args:
+        count: Number of iterations; the result holds ``4 * count`` examples.
+
+    Returns:
+        A list of conversation dicts produced by make().
+    """
+    def read_digits(digits, lang):
+        """Speak every character of a digit string as its own number word."""
+        return ' '.join(n2w(int(d), lang) for d in digits)
+
+    examples = []
+    for _ in range(count):
+        # Negative integers
+        n = -random.randint(1, 500)
+        examples.append(make(
+            f"[en] The temperature is {n} degrees.",
+            f"The temperature is {n2w(n, 'en')} degrees."
+        ))
+        examples.append(make(
+            f"[pl] Temperatura wynosi {n} stopni.",
+            f"Temperatura wynosi {n2w(n, 'pl')} stopni."
+        ))
+
+        # Decimals: the fraction is written without zero padding, so the
+        # spoken digits always match the digits visible in the raw text.
+        whole = random.randint(0, 999)
+        frac = str(random.randint(1, 99))
+        dec_str = f"{whole}.{frac}"
+
+        en_spoken = f"{n2w(whole, 'en')} point {read_digits(frac, 'en')}"
+        pl_spoken = f"{n2w(whole, 'pl')} przecinek {read_digits(frac, 'pl')}"
+
+        examples.append(make(
+            f"[en] The measurement reads {dec_str}.",
+            f"The measurement reads {en_spoken}."
+        ))
+        examples.append(make(
+            f"[pl] Pomiar wskazuje {dec_str}.",
+            f"Pomiar wskazuje {pl_spoken}."
+        ))
+
+    return examples
+
+
+def gen_ordinals(count=80):
+    """Build ordinal-number examples for 1..100.
+
+    Each iteration emits an [en] example ("the 3rd attempt" -> "the third
+    attempt") and a [pl] example ("3. rozdział" -> "trzeci rozdział").
+
+    Args:
+        count: Number of iterations; the result holds ``2 * count`` examples.
+
+    Returns:
+        A list of conversation dicts produced by make().
+    """
+    special_suffixes = {1: 'st', 2: 'nd', 3: 'rd'}
+
+    examples = []
+    for _ in range(count):
+        n = random.randint(1, 100)
+        # 11th, 12th and 13th take "th" even though they end in 1, 2, 3.
+        if 11 <= n % 100 <= 13:
+            suffix_raw = 'th'
+        else:
+            suffix_raw = special_suffixes.get(n % 10, 'th')
+
+        examples.append(make(
+            f"[en] This is the {n}{suffix_raw} attempt.",
+            f"This is the {n2w_ordinal(n, 'en')} attempt."
+        ))
+        # num2words produces masculine Polish ordinals ("pierwszy"), so the
+        # noun that follows has to be masculine for the sentence to agree.
+        examples.append(make(
+            f"[pl] To jest {n}. rozdział.",
+            f"To jest {n2w_ordinal(n, 'pl')} rozdział."
+        ))
+    return examples
+
+
+def gen_percentages(count=60):
+    """Build percentage examples with integer or one-decimal values.
+
+    Each iteration draws both an integer (0..100) and a float rounded to one
+    decimal, keeps one of them at random, and emits an [en] and a [pl]
+    example for it. Floats are spoken as whole part, separator word, then
+    the fractional digits one by one.
+    """
+    def spoken(value, lang, separator):
+        """Spell out ``value``; floats are read digit by digit after the point."""
+        if not isinstance(value, float):
+            return n2w(value, lang)
+        fraction_digits = str(value).split('.')[1]
+        tail = ' '.join(n2w(int(ch), lang) for ch in fraction_digits)
+        return f"{n2w(int(value), lang)} {separator} {tail}"
+
+    examples = []
+    for _ in range(count):
+        integer_candidate = random.randint(0, 100)
+        float_candidate = round(random.uniform(0, 100), 1)
+        n = random.choice([integer_candidate, float_candidate])
+
+        en_w = spoken(n, 'en', 'point')
+        pl_w = spoken(n, 'pl', 'przecinek')
+
+        examples.append(make(
+            f"[en] The success rate is {n}%.",
+            f"The success rate is {en_w} percent."
+        ))
+        examples.append(make(
+            f"[pl] Wskaźnik sukcesu wynosi {n}%.",
+            f"Wskaźnik sukcesu wynosi {pl_w} procent."
+        ))
+    return examples
+
+
+def gen_dates(count=100):
+    """Build date examples from numeric date strings.
+
+    Each iteration draws a day (1..28, valid in every month), a month and a
+    year (1990..2030) and emits:
+
+    * an [en] example whose raw date is US (MM/DD/YYYY), European
+      (DD.MM.YYYY) or ISO (YYYY-MM-DD), spoken as "<Month> <ordinal day>,
+      <year>" with the year read the way years are spoken in English
+      ("nineteen ninety-five") instead of as a plain cardinal;
+    * a [pl] example whose raw date is always DD.MM.YYYY, spoken as
+      "<ordinal day> <month in genitive> <year>".
+
+    Args:
+        count: Number of iterations; the result holds ``2 * count`` examples.
+
+    Returns:
+        A list of conversation dicts produced by make().
+    """
+    examples = []
+
+    months_en = ['January', 'February', 'March', 'April', 'May', 'June',
+                 'July', 'August', 'September', 'October', 'November', 'December']
+    months_pl_gen = ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
+                     'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia']
+
+    for _ in range(count):
+        day = random.randint(1, 28)
+        month = random.randint(1, 12)
+        year = random.randint(1990, 2030)
+
+        # EN: "05/25/2026", "25.05.2026" or "2026-05-25"
+        en_day_ord = n2w_ordinal(day, 'en')
+        # Year reading, not a cardinal: 1995 -> "nineteen ninety-five".
+        en_year = num2words(year, to='year', lang='en')
+        en_spoken = f"{months_en[month-1]} {en_day_ord}, {en_year}"
+
+        fmt = random.choice(['us', 'eu', 'iso'])
+        if fmt == 'us':
+            raw_date = f"{month:02d}/{day:02d}/{year}"
+        elif fmt == 'eu':
+            raw_date = f"{day:02d}.{month:02d}.{year}"
+        else:
+            raw_date = f"{year}-{month:02d}-{day:02d}"
+
+        examples.append(make(
+            f"[en] The deadline is {raw_date}.",
+            f"The deadline is {en_spoken}."
+        ))
+
+        # PL: only the dotted "25.05.2026" form is generated.
+        # NOTE(review): the Polish year is spoken as a plain cardinal;
+        # confirm whether an ordinal/genitive reading is wanted instead.
+        pl_day_ord = n2w_ordinal(day, 'pl')
+        pl_year = n2w(year, 'pl')
+        pl_spoken = f"{pl_day_ord} {months_pl_gen[month-1]} {pl_year}"
+
+        raw_date_pl = f"{day:02d}.{month:02d}.{year}"
+        examples.append(make(
+            f"[pl] Termin to {raw_date_pl}.",
+            f"Termin to {pl_spoken}."
+        ))
+
+    return examples
+
+
+def gen_times(count=80):
+    """Build clock-time examples in 24-hour and 12-hour notation.
+
+    Each iteration draws one time and emits three conversations: an [en] and
+    a [pl] example for the 24-hour string ("14:05") and an [en] example for
+    the 12-hour string ("2:05 PM").
+
+    English readings:
+      * minutes 1-9 get a leading "oh" ("fourteen oh five");
+      * a full hour is "<hour> hundred", or "midnight" for 00:00;
+      * the 12-hour form joins only the parts that exist, so a full hour
+        reads "two PM" with a single space.
+
+    Args:
+        count: Number of iterations; the result holds ``3 * count`` examples.
+
+    Returns:
+        A list of conversation dicts produced by make().
+    """
+    def en_minutes(minute):
+        """Spell out non-zero minutes; single digits are prefixed with "oh"."""
+        words = n2w(minute, 'en')
+        return f"oh {words}" if minute < 10 else words
+
+    examples = []
+    for _ in range(count):
+        h24 = random.randint(0, 23)
+        m = random.randint(0, 59)
+
+        # 24h format
+        time_str = f"{h24:02d}:{m:02d}"
+        if m:
+            en_spoken = f"{n2w(h24, 'en')} {en_minutes(m)}"
+        elif h24:
+            en_spoken = f"{n2w(h24, 'en')} hundred"
+        else:
+            en_spoken = "midnight"
+
+        # NOTE(review): Polish hours are spoken here as plain cardinals;
+        # confirm whether ordinal hour forms are wanted instead.
+        pl_spoken = n2w(h24, 'pl')
+        if m:
+            pl_spoken = f"{pl_spoken} {n2w(m, 'pl')}"
+
+        examples.append(make(
+            f"[en] The meeting is at {time_str}.",
+            f"The meeting is at {en_spoken}."
+        ))
+        examples.append(make(
+            f"[pl] Spotkanie jest o {time_str}.",
+            f"Spotkanie jest o {pl_spoken}."
+        ))
+
+        # 12h format (EN only common)
+        h12 = h24 % 12 or 12
+        ampm = "AM" if h24 < 12 else "PM"
+        time_12 = f"{h12}:{m:02d} {ampm}"
+        # Build from the parts that are present so no empty slot leaves a
+        # doubled space behind.
+        spoken_parts = [n2w(h12, 'en')]
+        if m:
+            spoken_parts.append(en_minutes(m))
+        spoken_parts.append(ampm)
+        en_12_spoken = ' '.join(spoken_parts)
+        examples.append(make(
+            f"[en] Alarm set for {time_12}.",
+            f"Alarm set for {en_12_spoken}."
+        ))
+
+    return examples
+
+
+def gen_currency(count=120):
+    """Build currency examples — any currency symbol in either language.
+
+    Two groups of examples are produced:
+
+    * ``count`` iterations of whole amounts, each giving an [en] and a [pl]
+      example built from the currency sentence templates;
+    * 20 iterations of amounts with a minor-unit part ("$12.05"), each
+      giving an [en] and a [pl] "Total:" example. The minor unit is named
+      per currency (cents, pence, grosze) and agrees with its number.
+
+    Args:
+        count: Number of whole-amount iterations; the result holds
+            ``2 * count + 40`` examples.
+
+    Returns:
+        A list of conversation dicts produced by make().
+    """
+    examples = []
+    symbols = ['$', '€', 'zł', '£']
+    prefixed = {'$', '€', '£'}  # symbol before number; all others follow it
+
+    # Minor-unit names: English (singular, plural), Polish (one, few, many).
+    minor_en = {
+        '$': ('cent', 'cents'),
+        '€': ('cent', 'cents'),
+        '£': ('penny', 'pence'),
+        'zł': ('grosz', 'groszy'),
+    }
+    minor_pl = {
+        '$': ('cent', 'centy', 'centów'),
+        '€': ('cent', 'centy', 'centów'),
+        '£': ('pens', 'pensy', 'pensów'),
+        'zł': ('grosz', 'grosze', 'groszy'),
+    }
+
+    def pl_minor_name(sym, amount):
+        """Pick the Polish minor-unit form that agrees with ``amount``."""
+        one, few, many = minor_pl[sym]
+        if amount == 1:
+            return one
+        if 2 <= amount % 10 <= 4 and not 12 <= amount % 100 <= 14:
+            return few
+        return many
+
+    for _ in range(count):
+        sym = random.choice(symbols)
+        n = random.choice([
+            random.randint(1, 99),
+            random.randint(100, 9999),
+            random.randint(10000, 999999),
+        ])
+
+        if sym in prefixed:
+            raw_amount = f"{sym}{n:,}"
+        else:
+            raw_amount = f"{n:,} {sym}"
+
+        # English
+        tpl = random.choice(CURRENCY_TEMPLATES_EN)
+        examples.append(make(
+            f"[en] {tpl.format(raw_amount)}",
+            tpl.format(fmt_currency(n, sym, 'en'))
+        ))
+
+        # Polish
+        tpl = random.choice(CURRENCY_TEMPLATES_PL)
+        examples.append(make(
+            f"[pl] {tpl.format(raw_amount)}",
+            tpl.format(fmt_currency(n, sym, 'pl'))
+        ))
+
+    # Amounts with a minor-unit part (cents / pence / grosze)
+    for _ in range(20):
+        whole = random.randint(1, 999)
+        cents = random.randint(1, 99)
+        sym = random.choice(symbols)
+
+        if sym in prefixed:
+            raw = f"{sym}{whole}.{cents:02d}"
+        else:
+            raw = f"{whole}.{cents:02d} {sym}"
+
+        en_single, en_plural = minor_en[sym]
+        en_minor = en_single if cents == 1 else en_plural
+        en_spoken = (
+            f"{n2w(whole, 'en')} {en_currency_name(sym, whole)} "
+            f"and {n2w(cents, 'en')} {en_minor}"
+        )
+        pl_spoken = (
+            f"{n2w(whole, 'pl')} {pl_currency_name(sym, whole)} "
+            f"i {n2w(cents, 'pl')} {pl_minor_name(sym, cents)}"
+        )
+
+        examples.append(make(f"[en] Total: {raw}", f"Total: {en_spoken}"))
+        examples.append(make(f"[pl] Łącznie: {raw}", f"Łącznie: {pl_spoken}"))
+
+    return examples
+
+
+def gen_units(count=80):
+    """Build measurement examples: a number followed by a unit abbreviation.
+
+    Each iteration picks one unit and one value (a small integer, a larger
+    integer, or a one-decimal float) and emits an [en] and a [pl] example in
+    which both the number and the unit are spelled out.
+    """
+    # abbreviation -> (English name, Polish name)
+    # NOTE(review): every name is a fixed plural form, so it is used even
+    # for values such as 1 — confirm this is acceptable for the dataset.
+    unit_names = {
+        'km':  ('kilometers', 'kilometrów'),
+        'm':   ('meters', 'metrów'),
+        'cm':  ('centimeters', 'centymetrów'),
+        'mm':  ('millimeters', 'milimetrów'),
+        'kg':  ('kilograms', 'kilogramów'),
+        'g':   ('grams', 'gramów'),
+        'mg':  ('milligrams', 'miligramów'),
+        'l':   ('liters', 'litrów'),
+        'ml':  ('milliliters', 'mililitrów'),
+        'km/h': ('kilometers per hour', 'kilometrów na godzinę'),
+        'mph': ('miles per hour', 'mil na godzinę'),
+        'GB':  ('gigabytes', 'gigabajtów'),
+        'MB':  ('megabytes', 'megabajtów'),
+        'TB':  ('terabytes', 'terabajtów'),
+        'GHz': ('gigahertz', 'gigaherców'),
+        'MHz': ('megahertz', 'megaherców'),
+        'kW':  ('kilowatts', 'kilowatów'),
+        'W':   ('watts', 'watów'),
+        'V':   ('volts', 'woltów'),
+        'A':   ('amperes', 'amperów'),
+    }
+    unit_entries = list(unit_names.items())
+
+    sentences = {
+        'en': [
+            "The speed is {} {}.",
+            "It weighs {} {}.",
+            "The capacity is {} {}.",
+            "Measured {} {}.",
+            "Maximum: {} {}.",
+        ],
+        'pl': [
+            "Prędkość wynosi {} {}.",
+            "Waży {} {}.",
+            "Pojemność to {} {}.",
+            "Zmierzono {} {}.",
+            "Maksimum: {} {}.",
+        ],
+    }
+    separators = {'en': 'point', 'pl': 'przecinek'}
+
+    def spoken(value, lang):
+        """Spell out ``value``; floats are read digit by digit after the point."""
+        if not isinstance(value, float):
+            return n2w(value, lang)
+        fraction_digits = str(value).split('.')[1]
+        tail = ' '.join(n2w(int(ch), lang) for ch in fraction_digits)
+        return f"{n2w(int(value), lang)} {separators[lang]} {tail}"
+
+    examples = []
+    for _ in range(count):
+        unit, names = random.choice(unit_entries)
+        small = random.randint(1, 99)
+        large = random.randint(100, 9999)
+        fractional = round(random.uniform(0.1, 99.9), 1)
+        n = random.choice([small, large, fractional])
+
+        raw_n = str(n)
+        for (tag, templates), unit_word in zip(sentences.items(), names):
+            template = random.choice(templates)
+            examples.append(make(
+                f"[{tag}] {template.format(raw_n, unit)}",
+                template.format(spoken(n, tag), unit_word)
+            ))
+
+    return examples
+
+
+def gen_temperatures(count=40):
+    """Build temperature examples with °C and °F symbols.
+
+    Each iteration emits an [en] and a [pl] Celsius example (-30..45) and an
+    [en] Fahrenheit example (0..110). The degree noun agrees with the
+    number: "one degree" / "two degrees" in English, and "stopień" /
+    "stopnie" / "stopni" in Polish.
+
+    Args:
+        count: Number of iterations; the result holds ``3 * count`` examples.
+
+    Returns:
+        A list of conversation dicts produced by make().
+    """
+    def en_degrees(value):
+        """English degree noun agreeing with ``value``."""
+        return 'degree' if abs(value) == 1 else 'degrees'
+
+    def pl_degrees(value):
+        """Polish degree noun agreeing with ``value``."""
+        magnitude = abs(value)
+        if magnitude == 1:
+            return 'stopień'
+        if 2 <= magnitude % 10 <= 4 and not 12 <= magnitude % 100 <= 14:
+            return 'stopnie'
+        return 'stopni'
+
+    examples = []
+    for _ in range(count):
+        t = random.randint(-30, 45)
+        examples.append(make(
+            f"[en] Current temperature: {t}°C.",
+            f"Current temperature: {n2w(t, 'en')} {en_degrees(t)} Celsius."
+        ))
+        examples.append(make(
+            f"[pl] Aktualna temperatura: {t}°C.",
+            f"Aktualna temperatura: {n2w(t, 'pl')} {pl_degrees(t)} Celsjusza."
+        ))
+
+        f_temp = random.randint(0, 110)
+        examples.append(make(
+            f"[en] It's {f_temp}°F outside.",
+            f"It's {n2w(f_temp, 'en')} {en_degrees(f_temp)} Fahrenheit outside."
+        ))
+
+    return examples
+
+
+def gen_acronyms(count=150):
+    """Build acronym examples: spelled out, read as a word, or left alone.
+
+    Each iteration emits five conversations:
+      * an [en] and a [pl] sentence with a letter-by-letter acronym, whose
+        expected output uses the spaced spelling;
+      * an [en] and a [pl] sentence with a word-like acronym, expected
+        unchanged;
+      * an [en] sentence that starts with an all-caps ordinary word,
+        expected unchanged.
+    """
+    spelled_pairs = list(SPELLED_ACRONYMS.items())
+    sentence_sets = (('en', ACRONYM_TEMPLATES_EN), ('pl', ACRONYM_TEMPLATES_PL))
+
+    examples = []
+    for _ in range(count):
+        # Acronym that is read letter by letter
+        acronym, letters = random.choice(spelled_pairs)
+        for tag, templates in sentence_sets:
+            template = random.choice(templates)
+            examples.append(make(
+                f"[{tag}] {template.format(acronym)}",
+                template.format(letters)
+            ))
+
+        # Acronym that is read as a word: output equals input
+        pronounced = random.choice(WORD_ACRONYMS)
+        en_sentence = f"The {pronounced} project launched."
+        pl_sentence = f"Projekt {pronounced} wystartował."
+        examples.append(make(f"[en] {en_sentence}", en_sentence))
+        examples.append(make(f"[pl] {pl_sentence}", pl_sentence))
+
+        # All-caps ordinary word: preserved exactly as written
+        shouted = random.choice(NOT_ACRONYMS)
+        notice = f"{shouted}: read the instructions."
+        examples.append(make(f"[en] {notice}", notice))
+
+    return examples
+
+
+def gen_nato(count=100):
+    """Build NATO phonetic alphabet examples in both directions.
+
+    Each iteration emits five conversations:
+      * [en] and [pl] "callsign" sentences whose code words ("Alfa Bravo")
+        become spaced letters ("A B");
+      * [en] and [pl] requests to spell a letter group, answered with the
+        code words;
+      * an [en] sentence with a second, independently drawn code-word group
+        turned into spaced letters.
+
+    Letters are drawn without repetition within one group.
+    """
+    alphabet = list(NATO.keys())
+
+    def code_words(group):
+        """Join the code words for every letter in ``group`` with spaces."""
+        return ' '.join(NATO[letter] for letter in group)
+
+    examples = []
+    for _ in range(count):
+        # Code words -> letters
+        picked = random.sample(alphabet, random.randint(2, 6))
+        spoken_group = code_words(picked)
+        compact_group = ''.join(picked)
+        spaced_group = ' '.join(picked)
+
+        examples.append(make(
+            f"[en] Callsign: {spoken_group}.",
+            f"Callsign: {spaced_group}."
+        ))
+        examples.append(make(
+            f"[pl] Znak wywoławczy: {spoken_group}.",
+            f"Znak wywoławczy: {spaced_group}."
+        ))
+
+        # Letters -> code words (reverse direction)
+        examples.append(make(
+            f"[en] Spell {compact_group} using NATO phonetic.",
+            f"{spoken_group}."
+        ))
+        examples.append(make(
+            f"[pl] Przeliteruj {compact_group} alfabetem NATO.",
+            f"{spoken_group}."
+        ))
+
+        # Code words inside running text
+        code = random.sample(alphabet, random.randint(3, 5))
+        examples.append(make(
+            f"[en] The code is {code_words(code)}.",
+            f"The code is {' '.join(code)}."
+        ))
+
+    return examples
+
+
+def gen_spelling_corrections(count=80):
+    """Build examples where one misspelled word is corrected.
+
+    Only typo-table pairs whose misspelling differs from the correction are
+    used. Each iteration emits one [en] and one [pl] example, skipping a
+    language whose list of usable pairs is empty.
+    """
+    en_items = [pair for pair in TYPOS_EN.items() if pair[0] != pair[1]]
+    pl_items = [pair for pair in TYPOS_PL.items() if pair[0] != pair[1]]
+
+    templates_en = [
+        "The {} was unexpected.",
+        "We need to {} the system.",
+        "It was {} to everyone.",
+        "The {} process failed.",
+        "Please {} the document.",
+    ]
+    templates_pl = [
+        "To było {}.",
+        "Trzeba {} system.",
+        "Było to {} dla wszystkich.",
+        "Proces {} się nie powiódł.",
+        "Proszę {} dokument.",
+    ]
+    sources = (
+        ('en', en_items, templates_en),
+        ('pl', pl_items, templates_pl),
+    )
+
+    examples = []
+    for _ in range(count):
+        for tag, pairs, templates in sources:
+            if not pairs:
+                continue
+            misspelled, corrected = random.choice(pairs)
+            template = random.choice(templates)
+            examples.append(make(
+                f"[{tag}] {template.format(misspelled)}",
+                template.format(corrected)
+            ))
+
+    return examples
+
+
+def gen_urls_emails(count=80):
+    """Build URL and e-mail examples spoken in the tagged language.
+
+    Each iteration emits an [en]/[pl] pair for an https URL and an [en]/[pl]
+    pair for an e-mail address. The "https://" scheme is dropped from the
+    spoken form. Punctuation is replaced by its name in the target language
+    ("dot"/"kropka", "at"/"małpa", ...), underscores are spoken, and digit
+    runs in paths are expanded to number words ("v2" -> "v two").
+
+    Args:
+        count: Number of iterations; the result holds ``4 * count`` examples.
+
+    Returns:
+        A list of conversation dicts produced by make().
+    """
+    examples = []
+
+    domains = ['github.com', 'google.com', 'example.org', 'openai.com',
+               'huggingface.co', 'reddit.com', 'stackoverflow.com',
+               'wikipedia.org', 'youtube.com', 'docs.python.org']
+    paths = ['/docs', '/api/v2', '/user/settings', '/search?q=test',
+             '/releases/latest', '/issues/42', '/wiki/Main_Page']
+    users = ['john', 'anna', 'admin', 'support', 'info', 'kontakt']
+    email_domains = ['gmail.com', 'outlook.com', 'firma.pl', 'example.org']
+
+    # Spoken name of each punctuation mark per target language. The Polish
+    # words match gen_symbols() where the two overlap ('@', '/', '#', '=').
+    spoken_marks = {
+        'en': {'.': 'dot', '/': 'slash', '?': 'question mark', '=': 'equals',
+               '#': 'hash', '_': 'underscore', '@': 'at'},
+        'pl': {'.': 'kropka', '/': 'slash', '?': 'znak zapytania',
+               '=': 'równa się', '#': 'hash', '_': 'podkreślnik', '@': 'małpa'},
+    }
+    # Top-level domains spelled letter by letter instead of read as a word.
+    spelled_tlds = {'pl': 'P L'}
+
+    def speak(text, lang):
+        """Replace digit runs and punctuation in ``text`` with words."""
+        text = re.sub(r'\d+', lambda m: f" {n2w(int(m.group()), lang)} ", text)
+        for mark, word in spoken_marks[lang].items():
+            text = text.replace(mark, f" {word} ")
+        # Collapse the padding added around every replacement.
+        return ' '.join(text.split())
+
+    def speak_domain(domain, lang):
+        """Speak a host name; the final label may be spelled out."""
+        host, _, tld = domain.rpartition('.')
+        dot = spoken_marks[lang]['.']
+        return f"{speak(host, lang)} {dot} {spelled_tlds.get(tld, tld)}"
+
+    for _ in range(count):
+        # URL
+        domain = random.choice(domains)
+        path = random.choice(paths + [''])
+        url = f"https://{domain}{path}"
+
+        spoken_url = {
+            lang: f"{speak_domain(domain, lang)} {speak(path, lang)}".strip()
+            for lang in ('en', 'pl')
+        }
+
+        examples.append(make(
+            f"[en] Visit {url} for details.",
+            f"Visit {spoken_url['en']} for details."
+        ))
+        examples.append(make(
+            f"[pl] Odwiedź {url} po szczegóły.",
+            f"Odwiedź {spoken_url['pl']} po szczegóły."
+        ))
+
+        # Email
+        user = random.choice(users)
+        edom = random.choice(email_domains)
+        email = f"{user}@{edom}"
+
+        spoken_email = {
+            lang: f"{user} {spoken_marks[lang]['@']} {speak_domain(edom, lang)}"
+            for lang in ('en', 'pl')
+        }
+
+        examples.append(make(
+            f"[en] Contact us at {email}.",
+            f"Contact us at {spoken_email['en']}."
+        ))
+        examples.append(make(
+            f"[pl] Napisz do nas na {email}.",
+            f"Napisz do nas na {spoken_email['pl']}."
+        ))
+
+    return examples
+
+
+def gen_symbols(count=60):
+    """Build examples that replace a symbol with its spoken name.
+
+    ``count`` iterations each emit an [en] and a [pl] "press <symbol>"
+    example. After that, 20 iterations each emit an [en] and a [pl]
+    arithmetic example in which both operands and the operator are spelled
+    out.
+    """
+    spoken_en = {
+        '@': 'at', '&': 'and', '#': 'hash', '%': 'percent',
+        '+': 'plus', '=': 'equals', '/': 'slash', '\\': 'backslash',
+        '*': 'asterisk', '^': 'caret', '~': 'tilde', '|': 'pipe',
+        '<': 'less than', '>': 'greater than',
+    }
+    spoken_pl = {
+        '@': 'małpa', '&': 'i', '#': 'hash', '%': 'procent',
+        '+': 'plus', '=': 'równa się', '/': 'slash', '\\': 'backslash',
+        '*': 'gwiazdka', '^': 'daszek', '~': 'tylda', '|': 'kreska pionowa',
+        '<': 'mniejsze niż', '>': 'większe niż',
+    }
+    candidates = list(spoken_en)
+
+    # In arithmetic, '*' and '/' read as operations rather than symbol names.
+    operators = ['+', '-', '*', '/']
+    operator_en = {'+': 'plus', '-': 'minus', '*': 'times', '/': 'divided by'}
+    operator_pl = {'+': 'plus', '-': 'minus', '*': 'razy', '/': 'podzielone przez'}
+
+    examples = []
+    for _ in range(count):
+        symbol = random.choice(candidates)
+        examples.append(make(
+            f"[en] Press {symbol} to continue.",
+            f"Press {spoken_en[symbol]} to continue."
+        ))
+        examples.append(make(
+            f"[pl] Naciśnij {symbol} aby kontynuować.",
+            f"Naciśnij {spoken_pl[symbol]} aby kontynuować."
+        ))
+
+    # Math expressions
+    for _ in range(20):
+        left = random.randint(1, 100)
+        right = random.randint(1, 100)
+        op = random.choice(operators)
+
+        examples.append(make(
+            f"[en] Calculate {left} {op} {right}.",
+            f"Calculate {n2w(left, 'en')} {operator_en[op]} {n2w(right, 'en')}."
+        ))
+        examples.append(make(
+            f"[pl] Oblicz {left} {op} {right}.",
+            f"Oblicz {n2w(left, 'pl')} {operator_pl[op]} {n2w(right, 'pl')}."
+        ))
+
+    return examples
+
+
+def gen_markdown(count=60):
+    """Build examples that strip markdown formatting.
+
+    Each iteration draws one fixed (markdown, plain) pair per language and
+    emits an [en] and a [pl] example. List items are joined with ", ", link
+    targets are dropped, and a file name inside inline code is spoken with
+    the dot named in the target language ("dot" / "kropka").
+
+    Args:
+        count: Number of iterations; the result holds ``2 * count`` examples.
+
+    Returns:
+        A list of conversation dicts produced by make().
+    """
+    examples = []
+    md_pairs_en = [
+        ("**important** update", "important update"),
+        ("the *quick* brown fox", "the quick brown fox"),
+        ("# Main Heading", "Main Heading"),
+        ("## Section Two", "Section Two"),
+        ("### Subsection", "Subsection"),
+        ("Check `config.json` file", "Check config dot json file"),
+        ("Use ```python\nprint()``` here", "Use print here"),
+        ("Visit [our site](https://example.com)", "Visit our site"),
+        ("- item one\n- item two", "item one, item two"),
+        ("1. first step\n2. second step", "first step, second step"),
+        ("> This is a quote", "This is a quote"),
+        ("~~deleted~~ text", "deleted text"),
+        ("__underline__ this", "underline this"),
+        ("***bold italic*** text", "bold italic text"),
+        ("some `inline code` here", "some inline code here"),
+    ]
+    md_pairs_pl = [
+        ("**ważna** aktualizacja", "ważna aktualizacja"),
+        ("*szybki* brązowy lis", "szybki brązowy lis"),
+        ("# Główny Nagłówek", "Główny Nagłówek"),
+        ("## Sekcja Druga", "Sekcja Druga"),
+        # The dot is named in Polish because the output is spoken by a
+        # Polish voice.
+        ("Sprawdź plik `config.json`", "Sprawdź plik config kropka json"),
+        ("Odwiedź [naszą stronę](https://example.com)", "Odwiedź naszą stronę"),
+        ("- element pierwszy\n- element drugi", "element pierwszy, element drugi"),
+        ("1. pierwszy krok\n2. drugi krok", "pierwszy krok, drugi krok"),
+        ("> To jest cytat", "To jest cytat"),
+        ("~~usunięty~~ tekst", "usunięty tekst"),
+        ("***pogrubiona kursywa*** tekst", "pogrubiona kursywa tekst"),
+    ]
+
+    for _ in range(count):
+        raw, norm = random.choice(md_pairs_en)
+        examples.append(make(f"[en] {raw}", norm))
+
+        raw, norm = random.choice(md_pairs_pl)
+        examples.append(make(f"[pl] {raw}", norm))
+
+    return examples
+
+
+def gen_phone_numbers(count=40):
+    """Phone number normalization — read as digit groups."""
+    examples = []
+    for _ in range(count):
+        # Polish phone: +48 XXX XXX XXX
+        g1 = random.randint(100, 999)
+        g2 = random.randint(100, 999)
+        g3 = random.randint(100, 999)
+        phone_pl = f"+48 {g1} {g2} {g3}"
+        spoken_digits = lambda n, lang: ' '.join(n2w(int(d), lang) for d in str(n))
+        pl_spoken = f"plus czterdzieści osiem, {spoken_digits(g1, 'pl')}, {spoken_digits(g2, 'pl')}, {spoken_digits(g3, 'pl')}"
+        en_spoken = f"plus forty-eight, {spoken_digits(g1, 'en')}, {spoken_digits(g2, 'en')}, {spoken_digits(g3, 'en')}"
+
+        examples.append(make(
+            f"[pl] Zadzwoń pod {phone_pl}.",
+            f"Zadzwoń pod {pl_spoken}."
+        ))
+        examples.append(make(
+            f"[en] Call {phone_pl}.",
+            f"Call {en_spoken}."
+        ))
+
+        # US phone: (XXX) XXX-XXXX
+        a = random.randint(200, 999)
+        b = random.randint(100, 999)
+        c = random.randint(1000, 9999)
+        phone_us = f"({a}) {b}-{c}"
+        en_us = f"{spoken_digits(a, 'en')}, {spoken_digits(b, 'en')}, {spoken_digits(c, 'en')}"
+
+        examples.append(make(
+            f"[en] Reach us at {phone_us}.",
+            f"Reach us at {en_us}."
+        ))
+
+    return examples
+
+
+def gen_versions_ips(count=40):
+    """Version numbers and IP addresses."""
+    examples = []
+    for _ in range(count):
+        # Version: v3.2.1
+        major = random.randint(0, 20)
+        minor = random.randint(0, 30)
+        patch = random.randint(0, 99)
+        ver = f"v{major}.{minor}.{patch}"
+        en_ver = f"version {n2w(major, 'en')} point {n2w(minor, 'en')} point {n2w(patch, 'en')}"
+        pl_ver = f"wersja {n2w(major, 'pl')} kropka {n2w(minor, 'pl')} kropka {n2w(patch, 'pl')}"
+
+        examples.append(make(
+            f"[en] Upgrade to {ver}.",
+            f"Upgrade to {en_ver}."
+        ))
+        examples.append(make(
+            f"[pl] Zaktualizuj do {ver}.",
+            f"Zaktualizuj do {pl_ver}."
+        ))
+
+        # IP: 192.168.1.100
+        octets = [random.randint(0, 255) for _ in range(4)]
+        ip = '.'.join(str(o) for o in octets)
+        en_ip = ' dot '.join(n2w(o, 'en') for o in octets)
+        pl_ip = ' kropka '.join(n2w(o, 'pl') for o in octets)
+
+        examples.append(make(
+            f"[en] Connect to {ip}.",
+            f"Connect to {en_ip}."
+        ))
+        examples.append(make(
+            f"[pl] Połącz się z {ip}.",
+            f"Połącz się z {pl_ip}."
+        ))
+
+    return examples
+
+
+def gen_passthrough(count=80):
+    """Clean text that should pass through unchanged — negative examples."""
+    clean_en = [
+        "Hello, how are you doing today?",
+        "The quick brown fox jumps over the lazy dog.",
+        "Please close the door when you leave.",
+        "She walked slowly through the garden.",
+        "The meeting went well yesterday.",
+        "I think we should reconsider the approach.",
+        "Thank you for your help with this project.",
+        "The weather is beautiful this morning.",
+        "Let me know if you need anything else.",
+        "We appreciate your continued support.",
+        "The results exceeded our expectations.",
+        "Can you send me the updated report?",
+        "The new design looks great.",
+        "I'll follow up with you next week.",
+        "The system is running smoothly now.",
+    ]
+    clean_pl = [
+        "Cześć, jak się masz?",
+        "Szybki brązowy lis przeskakuje nad leniwym psem.",
+        "Proszę zamknąć drzwi, gdy wychodzisz.",
+        "Szła powoli przez ogród.",
+        "Spotkanie przebiegło dobrze wczoraj.",
+        "Myślę, że powinniśmy ponownie rozważyć podejście.",
+        "Dziękuję za pomoc przy tym projekcie.",
+        "Pogoda jest piękna tego ranka.",
+        "Daj mi znać, jeśli potrzebujesz czegoś jeszcze.",
+        "Doceniamy wasze ciągłe wsparcie.",
+        "Wyniki przekroczyły nasze oczekiwania.",
+        "Czy możesz wysłać mi zaktualizowany raport?",
+        "Nowy projekt wygląda świetnie.",
+        "Skontaktuję się z tobą w przyszłym tygodniu.",
+        "System działa teraz płynnie.",
+    ]
+
+    examples = []
+    for _ in range(count):
+        text = random.choice(clean_en)
+        examples.append(make(f"[en] {text}", text))
+        text = random.choice(clean_pl)
+        examples.append(make(f"[pl] {text}", text))
+
+    return examples
+
+
+def gen_mixed(count=150):
+    """Complex mixed examples with multiple normalization needs."""
+    examples = []
+
+    mixed_en = [
+        (
+            "[en] The CPU usage hit 95% at 14:30, costing us $4,500 in SLA penalties.",
+            "The C P U usage hit ninety-five percent at fourteen thirty, costing us four thousand five hundred dollars in S L A penalties."
+        ),
+        (
+            "[en] **WARNING**: Server at 192.168.1.42 returned HTTP 503 error.",
+            "WARNING: Server at one hundred ninety-two dot one hundred sixty-eight dot one dot forty-two returned H T T P five hundred three error."
+        ),
+        (
+            "[en] Send the PDF to john@example.com by 05/30/2026.",
+            "Send the P D F to john at example dot com by May thirtieth, twenty twenty-six."
+        ),
+        (
+            "[en] The NASA rover traveled 3.7km at 0.5km/h on Mars.",
+            "The NASA rover traveled three point seven kilometers at zero point five kilometers per hour on Mars."
+        ),
+        (
+            "[en] Update to v2.4.1 — fixes #347 & improves GPU performance by 12%.",
+            "Update to version two point four point one — fixes number three hundred forty-seven and improves G P U performance by twelve percent."
+        ),
+        (
+            "[en] The FBI & CIA issued a joint FAQ about the VPN breach affecting 10,000+ users.",
+            "The F B I and C I A issued a joint F A Q about the V P N breach affecting ten thousand plus users."
+        ),
+        (
+            "[en] Meeting at 3:00 PM in room 401. Budget: €250,000. Contact HR ASAP.",
+            "Meeting at three PM in room four hundred one. Budget: two hundred fifty thousand euros. Contact H R A S A P."
+        ),
+        (
+            "[en] The LED display shows -5°C. The ATM is 200m away on 3rd street.",
+            "The L E D display shows minus five degrees Celsius. The A T M is two hundred meters away on third street."
+        ),
+        (
+            "[en] Check https://docs.python.org/api/v3 for the SDK documentation.",
+            "Check docs dot python dot org slash api slash v three for the S D K documentation."
+        ),
+        (
+            "[en] Flight BA247 departs at 08:15 from gate 12B. ETA: 11:30.",
+            "Flight B A two forty-seven departs at eight fifteen from gate twelve B. E T A: eleven thirty."
+        ),
+        (
+            "[en] The IoT device uses 2.4GHz WiFi & draws 5W at 12V DC.",
+            "The I o T device uses two point four gigahertz WiFi and draws five watts at twelve volts D C."
+        ),
+        (
+            "[en] She recieved the USB drive with 256GB of ML training data.",
+            "She received the U S B drive with two hundred fifty-six gigabytes of M L training data."
+        ),
+        (
+            "[en] Callsign: Tango Alpha Foxtrot. Coordinates: 52°N, 21°E.",
+            "Callsign: T A F. Coordinates: fifty-two degrees north, twenty-one degrees east."
+        ),
+    ]
+
+    mixed_pl = [
+        (
+            "[pl] Użycie CPU osiągnęło 95% o 14:30, kosztując nas $4,500 kar SLA.",
+            "Użycie C P U osiągnęło dziewięćdziesiąt pięć procent o czternasta trzydzieści, kosztując nas cztery tysiące pięćset dolarów kar S L A."
+        ),
+        (
+            "[pl] **UWAGA**: Serwer 192.168.1.42 zwrócił błąd HTTP 503.",
+            "UWAGA: Serwer sto dziewięćdziesiąt dwa kropka sto sześćdziesiąt osiem kropka jeden kropka czterdzieści dwa zwrócił błąd H T T P pięćset trzy."
+        ),
+        (
+            "[pl] Wyślij PDF na kontakt@firma.pl do 30.05.2026.",
+            "Wyślij P D F na kontakt małpa firma dot P L do trzydziestego maja dwa tysiące dwudziestego szóstego."
+        ),
+        (
+            "[pl] Łazik NASA przejechał 3,7km z prędkością 0,5km/h na Marsie.",
+            "Łazik NASA przejechał trzy przecinek siedem kilometrów z prędkością zero przecinek pięć kilometrów na godzinę na Marsie."
+        ),
+        (
+            "[pl] Aktualizacja do v2.4.1 — naprawia #347 & poprawia wydajność GPU o 12%.",
+            "Aktualizacja do wersja dwa kropka cztery kropka jeden — naprawia numer trzysta czterdzieści siedem i poprawia wydajność G P U o dwanaście procent."
+        ),
+        (
+            "[pl] ABW i CBA wydały wspólne FAQ o naruszeniu VPN dotyczącym 10000+ użytkowników.",
+            "A B W i C B A wydały wspólne F A Q o naruszeniu V P N dotyczącym dziesięć tysięcy plus użytkowników."
+        ),
+        (
+            "[pl] Spotkanie o 15:00 w pokoju 401. Budżet: €250000. Skontaktuj się z HR jak najszybciej.",
+            "Spotkanie o piętnaście zero zero w pokoju czterysta jeden. Budżet: dwieście pięćdziesiąt tysięcy euro. Skontaktuj się z H R jak najszybciej."
+        ),
+        (
+            "[pl] Wyświetlacz LED pokazuje -5°C. Bankomat jest 200m dalej na 3. ulicy.",
+            "Wyświetlacz L E D pokazuje minus pięć stopni Celsjusza. Bankomat jest dwieście metrów dalej na trzeciej ulicy."
+        ),
+        (
+            "[pl] Sprawdź https://docs.python.org/api/v3 po dokumentację SDK.",
+            "Sprawdź docs dot python dot org slash api slash v trzy po dokumentację S D K."
+        ),
+        (
+            "[pl] Lot BA247 odlatuje o 08:15 z bramki 12B. Przewidywany czas: 11:30.",
+            "Lot B A dwieście czterdzieści siedem odlatuje o osiem piętnaście z bramki dwanaście B. Przewidywany czas: jedenaście trzydzieści."
+        ),
+        (
+            "[pl] Urządzenie IoT używa WiFi 2,4GHz i pobiera 5W przy 12V DC.",
+            "Urządzenie I o T używa WiFi dwa przecinek cztery gigaherców i pobiera pięć watów przy dwanaście woltów D C."
+        ),
+        (
+            "[pl] Kupiła pendrive USB z 256GB danych treningowych ML za 149 zł.",
+            "Kupiła pendrive U S B z dwieście pięćdziesiąt sześć gigabajtów danych treningowych M L za sto czterdzieści dziewięć złotych."
+        ),
+        (
+            "[pl] Znak wywoławczy: Tango Alfa Foxtrot. Współrzędne: 52°N, 21°E.",
+            "Znak wywoławczy: T A F. Współrzędne: pięćdziesiąt dwa stopnie północ, dwadzieścia jeden stopni wschód."
+        ),
+    ]
+
+    for pair in mixed_en:
+        examples.append(make(pair[0], pair[1]))
+    for pair in mixed_pl:
+        examples.append(make(pair[0], pair[1]))
+
+    # Generate more mixed by combining elements
+    for _ in range(count - len(mixed_en) - len(mixed_pl)):
+        n = random.randint(100, 9999)
+        acr = random.choice(list(SPELLED_ACRONYMS.keys()))
+        acr_sp = SPELLED_ACRONYMS[acr]
+        sym = random.choice(['$', '€', 'zł', '£'])
+        amount = random.randint(10, 99999)
+
+        if random.random() < 0.5:
+            cur_raw = f"{sym}{amount:,}" if sym in {'$', '€', '£'} else f"{amount:,} {sym}"
+            examples.append(make(
+                f"[en] The {acr} report shows {n} entries totaling {cur_raw}.",
+                f"The {acr_sp} report shows {n2w(n, 'en')} entries totaling {fmt_currency(amount, sym, 'en')}."
+            ))
+        else:
+            cur_raw = f"{sym}{amount:,}" if sym in {'$', '€', '£'} else f"{amount:,} {sym}"
+            examples.append(make(
+                f"[pl] Raport {acr} pokazuje {n} wpisów na łącznie {cur_raw}.",
+                f"Raport {acr_sp} pokazuje {n2w(n, 'pl')} wpisów na łącznie {fmt_currency(amount, sym, 'pl')}."
+            ))
+
+    return examples
+
+
+def gen_abbreviations(count=40):
+    """Common abbreviations."""
+    abbrevs_en = {
+        'Mr.': 'Mister', 'Mrs.': 'Misses', 'Dr.': 'Doctor',
+        'St.': 'Saint', 'Ave.': 'Avenue', 'Blvd.': 'Boulevard',
+        'Jr.': 'Junior', 'Sr.': 'Senior', 'Prof.': 'Professor',
+        'vs.': 'versus', 'etc.': 'et cetera', 'e.g.': 'for example',
+        'i.e.': 'that is', 'approx.': 'approximately',
+        'dept.': 'department', 'govt.': 'government',
+        'inc.': 'incorporated', 'corp.': 'corporation',
+    }
+    abbrevs_pl = {
+        'dr': 'doktor', 'prof.': 'profesor', 'mgr': 'magister',
+        'inż.': 'inżynier', 'ul.': 'ulica', 'al.': 'aleja',
+        'pl.': 'plac', 'os.': 'osiedle', 'nr': 'numer',
+        'tel.': 'telefon', 'godz.': 'godzina', 'ok.': 'około',
+        'np.': 'na przykład', 'tj.': 'to jest', 'itd.': 'i tak dalej',
+        'itp.': 'i tym podobne', 'wg': 'według', 'dot.': 'dotyczący',
+    }
+
+    examples = []
+    for _ in range(count):
+        abbr, full = random.choice(list(abbrevs_en.items()))
+        examples.append(make(
+            f"[en] {abbr} Smith will arrive at 5.",
+            f"{full} Smith will arrive at five."
+        ))
+
+        abbr, full = random.choice(list(abbrevs_pl.items()))
+        examples.append(make(
+            f"[pl] {abbr} Kowalski przyjedzie o 5.",
+            f"{full} Kowalski przyjedzie o piąta."
+        ))
+
+    return examples
+
+
+def gen_contractions(count=40):
+    """English contractions — normalize for TTS clarity."""
+    contractions = {
+        "don't": "do not", "doesn't": "does not", "didn't": "did not",
+        "won't": "will not", "wouldn't": "would not", "couldn't": "could not",
+        "shouldn't": "should not", "can't": "cannot", "isn't": "is not",
+        "aren't": "are not", "wasn't": "was not", "weren't": "were not",
+        "hasn't": "has not", "haven't": "have not", "hadn't": "had not",
+        "I'm": "I am", "I've": "I have", "I'll": "I will", "I'd": "I would",
+        "you're": "you are", "you've": "you have", "you'll": "you will",
+        "he's": "he is", "she's": "she is", "it's": "it is",
+        "we're": "we are", "we've": "we have", "we'll": "we will",
+        "they're": "they are", "they've": "they have", "they'll": "they will",
+        "that's": "that is", "there's": "there is", "here's": "here is",
+        "let's": "let us", "who's": "who is", "what's": "what is",
+    }
+
+    templates = [
+        "{} the right approach.",
+        "I think {} a good idea.",
+        "{} going to work out.",
+        "They said {} ready yet.",
+        "{} what we expected.",
+    ]
+
+    examples = []
+    for _ in range(count):
+        contr, expanded = random.choice(list(contractions.items()))
+        tpl = random.choice(templates)
+        examples.append(make(
+            f"[en] {tpl.format(contr)}",
+            tpl.format(expanded)
+        ))
+
+    return examples
+
+
+def gen_large_numbers_shorthand(count=40):
+    """Large number shorthands: 4.5M, 2.3B, 1.2K etc."""
+    suffixes = {
+        'K': (1_000, 'thousand', 'tysięcy'),
+        'M': (1_000_000, 'million', 'milionów'),
+        'B': (1_000_000_000, 'billion', 'miliardów'),
+    }
+
+    examples = []
+    for _ in range(count):
+        suffix, (mult, en_word, pl_word) = random.choice(list(suffixes.items()))
+        whole = random.randint(1, 99)
+        frac = random.choice([0, random.randint(1, 9)])
+
+        if frac:
+            raw = f"{whole}.{frac}{suffix}"
+            en_spoken = f"{n2w(whole, 'en')} point {n2w(frac, 'en')} {en_word}"
+            pl_spoken = f"{n2w(whole, 'pl')} przecinek {n2w(frac, 'pl')} {pl_word}"
+        else:
+            raw = f"{whole}{suffix}"
+            en_spoken = f"{n2w(whole, 'en')} {en_word}"
+            pl_spoken = f"{n2w(whole, 'pl')} {pl_word}"
+
+        for sym in ['$', '€', 'zł']:
+            if sym in {'$', '€'}:
+                cur_raw = f"{sym}{raw}"
+            else:
+                cur_raw = f"{raw} {sym}"
+
+            if frac:
+                val_for_cur = whole  # approximate — the model should learn the pattern
+            else:
+                val_for_cur = whole
+
+            examples.append(make(
+                f"[en] Revenue reached {cur_raw}.",
+                f"Revenue reached {en_spoken} {en_currency_name(sym, 2)}."
+            ))
+            examples.append(make(
+                f"[pl] Przychody osiągnęły {cur_raw}.",
+                f"Przychody osiągnęły {pl_spoken} {pl_currency_name(sym, 5)}."
+            ))
+
+    return examples
+
+
+# ──────────────────────────────────────────────────────────────
+# MAIN
+# ──────────────────────────────────────────────────────────────
+
+def main():
+    random.seed(SEED)
+    all_examples = []
+
+    generators = [
+        ("numbers", gen_numbers, 200),
+        ("negatives_decimals", gen_negatives_decimals, 80),
+        ("ordinals", gen_ordinals, 80),
+        ("percentages", gen_percentages, 60),
+        ("dates", gen_dates, 100),
+        ("times", gen_times, 80),
+        ("currency", gen_currency, 120),
+        ("units", gen_units, 80),
+        ("temperatures", gen_temperatures, 40),
+        ("acronyms", gen_acronyms, 150),
+        ("nato", gen_nato, 100),
+        ("spelling", gen_spelling_corrections, 80),
+        ("urls_emails", gen_urls_emails, 80),
+        ("symbols", gen_symbols, 60),
+        ("markdown", gen_markdown, 60),
+        ("phone_numbers", gen_phone_numbers, 40),
+        ("versions_ips", gen_versions_ips, 40),
+        ("passthrough", gen_passthrough, 80),
+        ("mixed", gen_mixed, 150),
+        ("abbreviations", gen_abbreviations, 40),
+        ("contractions", gen_contractions, 40),
+        ("large_numbers", gen_large_numbers_shorthand, 40),
+    ]
+
+    for name, gen_fn, count in generators:
+        examples = gen_fn(count)
+        print(f"  {name}: {len(examples)} examples")
+        all_examples.extend(examples)
+
+    random.shuffle(all_examples)
+
+    with OUTPUT.open("w", encoding="utf-8") as f:
+        for ex in all_examples:
+            f.write(json.dumps(ex, ensure_ascii=False) + "\n")
+
+    print(f"\nTotal: {len(all_examples)} examples -> {OUTPUT}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/train_tts_norm.py b/train_tts_norm.py
new file mode 100644
index 0000000..75736c0
--- /dev/null
+++ b/train_tts_norm.py
@@ -0,0 +1,173 @@
+#!/usr/bin/env python3
+"""Train TTS normalization LoRA on Qwen2.5-7B-Instruct using Unsloth.
+
+Reads: tts_norm_dataset.jsonl (ShareGPT format)
+Output: tts-norm-lora/ adapter (vLLM-compatible)
+
+Run on junkpile — RTX 2000 Ada 16GB.
+"""
+
+import os
+import torch
+from pathlib import Path
+from unsloth import FastLanguageModel
+from unsloth.chat_templates import get_chat_template, standardize_sharegpt
+from trl import SFTTrainer
+from transformers import TrainingArguments
+from datasets import load_dataset
+
+# ──────────────────────────────────────────────────────────────
+# CONFIG
+# ──────────────────────────────────────────────────────────────
+
+# Run settings, all read by the steps further down in this script.
+
+# Checkpoint name passed to FastLanguageModel.from_pretrained.
+MODEL_NAME = "unsloth/Qwen2.5-7B-Instruct-bnb-4bit"
+# JSONL training file, read with datasets.load_dataset("json", ...).
+DATASET_PATH = "tts_norm_dataset.jsonl"
+# Trainer output directory; the final adapter and tokenizer are saved here too.
+OUTPUT_DIR = "./tts-norm-lora"
+MAX_SEQ_LEN = 768         # TTS normalization is short text
+# LoRA rank and alpha handed to get_peft_model; the rank is also printed
+# as --max-lora-rank in the serve hint at the end of the script.
+LORA_RANK = 16
+LORA_ALPHA = 16
+BATCH_SIZE = 2             # small GPU — use grad accumulation
+GRAD_ACCUM = 8             # effective batch = 16
+EPOCHS = 3
+LR = 2e-4
+WARMUP_STEPS = 30
+# Checkpoint and logging intervals, in optimizer steps.
+SAVE_STEPS = 100
+LOGGING_STEPS = 10
+# Used both as the LoRA random_state and as the TrainingArguments seed.
+SEED = 42
+
+# ──────────────────────────────────────────────────────────────
+# LOAD MODEL
+# ──────────────────────────────────────────────────────────────
+
+# Load the base model and its tokenizer through Unsloth with 4-bit loading
+# enabled, capped at MAX_SEQ_LEN tokens per sequence.
+print(f"Loading {MODEL_NAME}...")
+model, tokenizer = FastLanguageModel.from_pretrained(
+    model_name=MODEL_NAME,
+    max_seq_length=MAX_SEQ_LEN,
+    load_in_4bit=True,
+    dtype=None,  # auto-detect
+)
+
+# Apply chat template: rebind the tokenizer to the one returned for the
+# "qwen-2.5" template, which apply_chat_template uses further down.
+tokenizer = get_chat_template(
+    tokenizer,
+    chat_template="qwen-2.5",
+)
+
+# ──────────────────────────────────────────────────────────────
+# PEFT CONFIG
+# ──────────────────────────────────────────────────────────────
+
+# Wrap the loaded model with LoRA adapters; from here on `model` refers to
+# the PEFT-wrapped model that is trained and saved below.
+print("Applying LoRA...")
+model = FastLanguageModel.get_peft_model(
+    model,
+    r=LORA_RANK,
+    lora_alpha=LORA_ALPHA,
+    # Names of the projection modules that receive adapters.
+    target_modules=[
+        "q_proj", "k_proj", "v_proj", "o_proj",
+        "gate_proj", "up_proj", "down_proj",
+    ],
+    lora_dropout=0,
+    bias="none",
+    use_gradient_checkpointing="unsloth",
+    random_state=SEED,  # same seed as the trainer
+)
+
+# ──────────────────────────────────────────────────────────────
+# DATASET
+# ──────────────────────────────────────────────────────────────
+
+print(f"Loading dataset from {DATASET_PATH}...")
+dataset = load_dataset("json", data_files=DATASET_PATH, split="train")
+print(f"  {len(dataset)} examples loaded")
+
+# Standardize to ShareGPT format (handles from/value vs role/content)
+dataset = standardize_sharegpt(dataset)
+
+# Pre-apply chat template via map — avoids formatting_func signature issues
+def apply_template(examples):
+    """Apply Qwen2.5 chat template to conversations."""
+    convos = examples["conversations"]
+    texts = []
+    for convo in convos:
+        text = tokenizer.apply_chat_template(
+            convo,
+            tokenize=False,
+            add_generation_prompt=False,
+        )
+        texts.append(text)
+    return {"text": texts}
+
+print("Applying chat template...")
+dataset = dataset.map(apply_template, batched=True, num_proc=2)
+
+# ──────────────────────────────────────────────────────────────
+# TRAINER
+# ──────────────────────────────────────────────────────────────
+
+print("Setting up trainer...")
+trainer = SFTTrainer(
+    model=model,
+    tokenizer=tokenizer,
+    train_dataset=dataset,
+    dataset_text_field="text",
+    args=TrainingArguments(
+        output_dir=OUTPUT_DIR,
+        per_device_train_batch_size=BATCH_SIZE,
+        gradient_accumulation_steps=GRAD_ACCUM,
+        num_train_epochs=EPOCHS,
+        learning_rate=LR,
+        lr_scheduler_type="cosine",
+        warmup_steps=WARMUP_STEPS,
+        fp16=not torch.cuda.is_bf16_supported(),
+        bf16=torch.cuda.is_bf16_supported(),
+        logging_steps=LOGGING_STEPS,
+        save_steps=SAVE_STEPS,
+        save_total_limit=3,
+        seed=SEED,
+        optim="adamw_8bit",
+        weight_decay=0.01,
+        max_grad_norm=1.0,
+        report_to="none",
+        dataloader_num_workers=2,
+    ),
+    max_seq_length=MAX_SEQ_LEN,
+    dataset_num_proc=2,
+    packing=True,  # pack short examples for efficiency
+)
+
+# ──────────────────────────────────────────────────────────────
+# TRAIN
+# ──────────────────────────────────────────────────────────────
+
+print("Starting training...")
+stats = trainer.train()
+print(f"\nTraining complete!")
+print(f"  Total steps: {stats.global_step}")
+print(f"  Train loss: {stats.training_loss:.4f}")
+print(f"  Runtime: {stats.metrics['train_runtime']:.0f}s")
+
+# ──────────────────────────────────────────────────────────────
+# SAVE ADAPTER
+# ──────────────────────────────────────────────────────────────
+
+print(f"\nSaving adapter to {OUTPUT_DIR}...")
+model.save_pretrained(OUTPUT_DIR)
+tokenizer.save_pretrained(OUTPUT_DIR)
+
+# Verify
+adapter_path = Path(OUTPUT_DIR) / "adapter_model.safetensors"
+if adapter_path.exists():
+    size_mb = adapter_path.stat().st_size / (1024 * 1024)
+    print(f"  Adapter saved: {size_mb:.1f} MB")
+else:
+    print("  WARNING: adapter_model.safetensors not found!")
+
+config_path = Path(OUTPUT_DIR) / "adapter_config.json"
+if config_path.exists():
+    print(f"  Config saved: {config_path}")
+
+print(f"\nDone. Serve with:")
+print(f"  vllm serve Qwen/Qwen2.5-7B-Instruct \\")
+print(f"    --enable-lora \\")
+print(f"    --lora-modules tts-norm={os.path.abspath(OUTPUT_DIR)} \\")
+print(f"    --max-lora-rank {LORA_RANK}")