Files
lora/gen_tts_dataset.py
marauder-actual 122e73860b feat: tts-norm LoRA — dataset generator + training script
gen_tts_dataset.py: 4960 synthetic examples, 22 categories (numbers,
currencies, dates, times, temperatures, acronyms, NATO phonetic, URLs,
markdown, etc). Bilingual EN/PL with explicit [lang] tag prefix.

train_tts_norm.py: Unsloth LoRA training for Qwen2.5-7B-Instruct.
Rank 16, 3 epochs, packing, max_seq 768. Trained on H100 in 20m38s,
final loss 0.091. Adapter: 154MB.
2026-05-26 00:14:51 +02:00

1353 lines
51 KiB
Python

#!/usr/bin/env python3
"""Generate TTS normalization LoRA training dataset.
Language is EXPLICITLY tagged [en] or [pl] by the TTS client — no detection/inference.
Currency, units, and content are language-independent (Polish text can have dollars,
English text can have złoty).
Output: JSONL in ShareGPT conversation format for Unsloth/Qwen2.5-Instruct.
"""
import json
import random
import re
from pathlib import Path
from num2words import num2words
# Seed for the pseudo-random generators. Presumably passed to random.seed()
# by the entry point, which is outside this view — TODO confirm.
SEED = 42
# File the dataset is presumably written to (the module docstring says the
# output is JSONL); the write itself is outside this view.
OUTPUT = Path("tts_norm_dataset.jsonl")
# System prompt that make() places as the first turn of every conversation.
# It describes the whole normalization contract, including that the leading
# [en]/[pl] tag must not appear in the output.
SYSTEM = (
    "You are a TTS text preprocessor. The input begins with a language tag [en] or [pl] "
    "that specifies the target speech language. Normalize the text for natural speech synthesis: "
    "expand numbers to words in the tagged language, handle acronyms (spell out letter-acronyms "
    "with spaces between letters, keep pronounceable acronyms intact), fix spelling errors, "
    "convert symbols and URLs to spoken form, and strip markdown formatting. "
    "Output only the normalized text without the language tag."
)
# ──────────────────────────────────────────────────────────────
# NATO PHONETIC ALPHABET
# ──────────────────────────────────────────────────────────────
# Letter -> phonetic code word. Every code word begins with the letter it
# stands for, so the key is taken from the word's first character.
NATO = {
    code_word[0]: code_word
    for code_word in (
        'Alfa', 'Bravo', 'Charlie', 'Delta', 'Echo', 'Foxtrot', 'Golf',
        'Hotel', 'India', 'Juliett', 'Kilo', 'Lima', 'Mike', 'November',
        'Oscar', 'Papa', 'Quebec', 'Romeo', 'Sierra', 'Tango', 'Uniform',
        'Victor', 'Whiskey', 'X-ray', 'Yankee', 'Zulu',
    )
}
# ──────────────────────────────────────────────────────────────
# ACRONYMS — spelled letter-by-letter
# ──────────────────────────────────────────────────────────────
# Acronym -> spoken form with one space between the letters ("API" -> "A P I").
# The spoken form is always the acronym's own letters, so it is derived
# instead of being typed out a second time.
SPELLED_ACRONYMS = {
    acronym: ' '.join(acronym)
    for acronym in (
        'API', 'CPU', 'GPU', 'RAM',
        'SSD', 'HTTP', 'HTTPS',
        'URL', 'HTML', 'CSS',
        'FBI', 'CIA', 'NFL', 'NBA',
        'MVP', 'CEO', 'CFO', 'CTO',
        'AI', 'ML', 'OS', 'UI', 'UX',
        'IP', 'VPN', 'DNS', 'SSH',
        'FTP', 'SQL', 'XML', 'JSON',
        'AWS', 'USB', 'HDMI',
        'LCD', 'LED', 'PDF', 'FAQ',
        'DIY', 'ETA', 'FYI', 'ASAP',
        'GPS', 'ATM', 'BBC', 'CNN',
        'EU', 'UN', 'UK', 'US',
        'RGB', 'TCP', 'UDP',
        'IDE', 'SDK', 'CI', 'CD',
        'PR', 'QA', 'IT', 'HR',
        'PKP', 'PZU', 'ZUS', 'NFZ',
        'PKO', 'TVP', 'TVN',
        'KRS', 'NIP', 'VAT',
        'ZTM', 'MPK', 'GUS', 'PGE',
        'NBP', 'PKB', 'UE', 'ONZ',
        'RPO', 'NIK', 'SLD',
        'PIS', 'PSL', 'PCK',
        'AGH', 'UJ', 'UW', 'PWN',
        'IPN', 'ABW', 'CBA',
        'PIT', 'CIT', 'PESEL',
        'TTS', 'NPC', 'RPG', 'PVP',
        'DPS', 'MMO', 'RNG', 'AFK',
        'LLM', 'NLP', 'OCR', 'ORM',
    )
}
# Acronyms pronounced as words (not spelled).
# gen_acronyms() emits these unchanged on both the input and the target side,
# as counter-examples to the letter-by-letter spelling of SPELLED_ACRONYMS.
WORD_ACRONYMS = [
    'NASA', 'NATO', 'LASER', 'RADAR', 'SCUBA', 'PIN', 'SIM', 'BIOS',
    'CAPTCHA', 'AWOL', 'GIF', 'JPEG', 'GOPR', 'IKEA', 'FIAT',
    'UNICEF', 'UNESCO', 'AIDS', 'COVID', 'BASIC', 'SWIFT', 'DART',
    'RUST', 'AJAX', 'LIDAR', 'MODEM', 'PIXEL',
]
# Words that LOOK like acronyms (all-caps) but are just regular words.
# gen_acronyms() keeps them byte-for-byte in the target (no lowercasing).
# NOTE(review): 'STOP' is listed twice, so random.choice() picks it twice as
# often as any other entry — confirm whether the second occurrence is meant
# as the Polish-language entry or is an accidental duplicate.
NOT_ACRONYMS = [
    'NVIDIA', 'HELLO', 'STOP', 'WARNING', 'ERROR', 'DANGER',
    'IMPORTANT', 'NOTE', 'URGENT', 'ATTENTION', 'WELCOME', 'EXIT',
    'OPEN', 'CLOSE', 'START', 'FINISH', 'UWAGA', 'STOP', 'WEJŚCIE',
    'WYJŚCIE', 'ZAMKNIĘTE', 'OTWARTE',
]
# ──────────────────────────────────────────────────────────────
# SPELLING ERRORS
# ──────────────────────────────────────────────────────────────
# English misspelling -> corrected spelling.
# gen_spelling_corrections() inserts the key into the input sentence and the
# value into the target sentence.
# NOTE(review): 'judgement' is an accepted spelling in some English variants —
# confirm that teaching the model to rewrite it is intended.
TYPOS_EN = {
    'recieve': 'receive', 'definately': 'definitely', 'occured': 'occurred',
    'seperate': 'separate', 'accomodate': 'accommodate', 'neccessary': 'necessary',
    'wierd': 'weird', 'occassion': 'occasion', 'concious': 'conscious',
    'enviroment': 'environment', 'goverment': 'government', 'independant': 'independent',
    'knowlege': 'knowledge', 'langauge': 'language', 'maintainance': 'maintenance',
    'millenium': 'millennium', 'noticable': 'noticeable', 'persistant': 'persistent',
    'publically': 'publicly', 'recomend': 'recommend', 'refering': 'referring',
    'successfull': 'successful', 'suprise': 'surprise', 'tommorow': 'tomorrow',
    'untill': 'until', 'wether': 'whether', 'wich': 'which',
    'thier': 'their', 'teh': 'the', 'adn': 'and', 'hte': 'the',
    'becuase': 'because', 'beleive': 'believe', 'calender': 'calendar',
    'collegue': 'colleague', 'comittee': 'committee', 'dissapoint': 'disappoint',
    'embarass': 'embarrass', 'existance': 'existence', 'foriegn': 'foreign',
    'gurantee': 'guarantee', 'harrass': 'harass', 'imediately': 'immediately',
    'jewlery': 'jewelry', 'judgement': 'judgment', 'liason': 'liaison',
}
# Polish misspelling -> corrected spelling.
# Entries whose key equals their value are already-correct words; the filter
# `bad != good` in gen_spelling_corrections() drops them, so within the code
# visible here they produce no training examples.
# NOTE(review): 'wyrzygnąć' -> 'wyrzucić' looks like a word substitution
# rather than a spelling correction — confirm it is intended.
TYPOS_PL = {
    'rząd': 'rząd',  # correct, no-op example
    'wziąść': 'wziąć', 'włanczyć': 'włączyć', 'poszłem': 'poszedłem',
    'pokarze': 'pokaże', 'napewno': 'na pewno', 'wogóle': 'w ogóle',
    'niewiem': 'nie wiem', 'przedewszystkim': 'przede wszystkim',
    'conajmniej': 'co najmniej', 'natomist': 'natomiast',
    'ponieważ': 'ponieważ',  # correct
    'żadko': 'rzadko', 'bynajmiej': 'bynajmniej',
    'jakby': 'jakby',  # correct
    'pomóżcie': 'pomóżcie',  # correct
    'włożyłam': 'włożyłam',  # correct
    'spróbój': 'spróbuj', 'wyrzygnąć': 'wyrzucić',
    'przyjeżdzać': 'przyjeżdżać', 'żółtko': 'żółtko',  # correct
    'gżegżółka': 'gżegżółka',  # correct — hard word
    'sprawdźić': 'sprawdzić', 'ząmówić': 'zamówić',
    'orginalny': 'oryginalny', 'symultaniczny': 'symultaniczny',
    'odzwyczaić': 'odzwyczaić',  # correct
    'czterysta': 'czterysta',  # correct
    'rzentelny': 'rzetelny', 'wchodzić': 'wchodzić',  # correct
    'porządże': 'porządze',  # intentional trap
}
# ──────────────────────────────────────────────────────────────
# SENTENCE TEMPLATES (with {} placeholders)
# ──────────────────────────────────────────────────────────────
# Carrier sentences with a single {} slot. The generators call str.format()
# on the same template twice: once with the raw token (digits, symbol,
# acronym) for the model input and once with the spoken form for the target.
# The EN and PL lists are sampled independently of each other, so they do not
# have to be translations of one another or stay index-aligned.

# Slot: a cardinal number (see gen_numbers).
NUM_TEMPLATES_EN = [
    "There are {} items in the queue.",
    "The building has {} floors.",
    "She scored {} points in the final round.",
    "Approximately {} people attended the event.",
    "{} units were shipped yesterday.",
    "The file contains {} lines of code.",
    "He waited {} minutes for the train.",
    "The population reached {} last year.",
    "We need {} more signatures.",
    "The distance is {} meters.",
    "Page {} of the document.",
    "Chapter {} covers advanced topics.",
    "Flight {} departs at noon.",
    "Room {} is on the third floor.",
    "The team completed {} sprints this quarter.",
]
# The nouns after the slot are fixed, so they do not change form with the
# number that is substituted in.
NUM_TEMPLATES_PL = [
    "W kolejce jest {} elementów.",
    "Budynek ma {} pięter.",
    "Zdobyła {} punktów w finale.",
    "Na wydarzeniu pojawiło się około {} osób.",
    "Wysłano {} jednostek wczoraj.",
    "Plik zawiera {} linii kodu.",
    "Czekał {} minut na pociąg.",
    "Populacja osiągnęła {} w zeszłym roku.",
    "Potrzebujemy jeszcze {} podpisów.",
    "Odległość wynosi {} metrów.",
    "Strona {} dokumentu.",
    "Rozdział {} obejmuje zaawansowane tematy.",
    "Lot {} odlatuje w południe.",
    "Pokój {} jest na trzecim piętrze.",
    "Zespół ukończył {} sprintów w tym kwartale.",
]
# Slot: a complete money amount, symbol included (see gen_currency).
CURRENCY_TEMPLATES_EN = [
    "The total is {}.",
    "She paid {}.",
    "The budget was set at {}.",
    "It costs {} per unit.",
    "They raised {} for charity.",
    "The invoice shows {}.",
    "Repairs will cost approximately {}.",
    "The price dropped to {}.",
]
CURRENCY_TEMPLATES_PL = [
    "Łącznie to {}.",
    "Zapłaciła {}.",
    "Budżet ustalono na {}.",
    "Kosztuje {} za sztukę.",
    "Zebrali {} na cele charytatywne.",
    "Faktura pokazuje {}.",
    "Naprawa będzie kosztować około {}.",
    "Cena spadła do {}.",
]
# Slot: an acronym, either raw or spelled letter by letter (see gen_acronyms).
ACRONYM_TEMPLATES_EN = [
    "The {} system is down.",
    "Check the {} settings.",
    "We're migrating to {}.",
    "The {} update is ready.",
    "{} performance improved significantly.",
    "Connect via {} to the server.",
    "The {} team approved the request.",
]
ACRONYM_TEMPLATES_PL = [
    "System {} nie działa.",
    "Sprawdź ustawienia {}.",
    "Migrujemy do {}.",
    "Aktualizacja {} jest gotowa.",
    "Wydajność {} znacząco się poprawiła.",
    "Połącz się przez {} z serwerem.",
    "Zespół {} zatwierdził wniosek.",
]
# ──────────────────────────────────────────────────────────────
# HELPERS
# ──────────────────────────────────────────────────────────────
def n2w(n, lang):
    """Spell the number *n* out in words in language *lang* via num2words.

    Negative ints and floats are not handed to num2words with their sign.
    The absolute value is converted and the literal word "minus" is put in
    front, for every language (a workaround for a negative-number problem in
    the Polish converter, per the original note).
    """
    is_negative_number = isinstance(n, (int, float)) and n < 0
    if not is_negative_number:
        return num2words(n, lang=lang)
    magnitude = num2words(abs(n), lang=lang)
    return f"minus {magnitude}"
def n2w_ordinal(n, lang):
    """Return *n* as an ordinal word in language *lang* via num2words."""
    conversion = {'to': 'ordinal', 'lang': lang}
    return num2words(n, **conversion)
def en_currency_name(symbol, n):
    """Return the English name of the currency *symbol*, agreeing with *n*.

    Args:
        symbol: Currency marker as it appears in raw text ('$', '€', 'zł',
            '£', '¥', 'CHF', 'kr').
        n: The amount; only whether its absolute value equals 1 matters.

    Returns:
        The singular name when ``abs(n) == 1``, otherwise the plural. An
        unknown symbol is returned unchanged.
    """
    # The euro and zloty entries used to share the empty string as their key,
    # so the later one silently replaced the earlier one in the dict literal.
    names = {
        '$': ('dollar', 'dollars'),
        '€': ('euro', 'euros'),
        'zł': ('zloty', 'zloty'),
        '£': ('pound', 'pounds'),
        '¥': ('yen', 'yen'),
        'CHF': ('Swiss franc', 'Swiss francs'),
        'kr': ('krona', 'kronor'),
    }
    singular, plural = names.get(symbol, (symbol, symbol))
    return singular if abs(n) == 1 else plural
def pl_currency_name(symbol, n):
    """Return the Polish name of the currency *symbol*, declined to match *n*.

    Args:
        symbol: Currency marker as it appears in raw text ('$', '€', 'zł',
            '£', '¥', 'CHF', 'kr').
        n: The amount (an integer is expected by the modulo arithmetic).

    Returns:
        The form used with 1, the form used with numbers ending in 2-4
        (except those ending in 12-14), or the form used with everything
        else. An unknown symbol is returned unchanged.
    """
    # (form for 1, form for 2-4, form for the rest)
    # The euro and zloty entries used to share the empty string as their key,
    # so the later one silently replaced the earlier one in the dict literal.
    table = {
        '$': ('dolar', 'dolary', 'dolarów'),
        '€': ('euro', 'euro', 'euro'),
        'zł': ('złoty', 'złote', 'złotych'),
        '£': ('funt', 'funty', 'funtów'),
        '¥': ('jen', 'jeny', 'jenów'),
        'CHF': ('frank szwajcarski', 'franki szwajcarskie', 'franków szwajcarskich'),
        'kr': ('korona', 'korony', 'koron'),
    }
    one, few, many = table.get(symbol, (symbol, symbol, symbol))

    magnitude = abs(n)
    if magnitude == 1:
        return one
    ends_in_two_to_four = 2 <= magnitude % 10 <= 4
    ends_in_teen = 12 <= magnitude % 100 <= 14
    if ends_in_two_to_four and not ends_in_teen:
        return few
    return many
def fmt_currency(n, symbol, lang):
    """Render an amount as "<number in words> <currency name>".

    'en' selects the English currency names; any other *lang* value selects
    the Polish ones.
    """
    name_for = en_currency_name if lang == 'en' else pl_currency_name
    return f"{n2w(n, lang)} {name_for(symbol, n)}"
def make(raw, norm):
    """Wrap one raw/normalized pair as a three-turn ShareGPT conversation.

    The turns are the shared system prompt, *raw* as the human message and
    *norm* as the model ("gpt") reply.
    """
    turns = (
        ("system", SYSTEM),
        ("human", raw),
        ("gpt", norm),
    )
    return {
        "conversations": [
            {"from": speaker, "value": text} for speaker, text in turns
        ]
    }
# ──────────────────────────────────────────────────────────────
# GENERATORS
# ──────────────────────────────────────────────────────────────
def gen_numbers(count=200):
    """Build cardinal-number examples embedded in carrier sentences.

    Each of the *count* iterations draws one number and emits an English and
    a Polish example for it, so 2 * count examples are returned.
    """
    # A band is picked first, then a value inside it, so small and large
    # numbers are equally likely despite the very different band widths.
    magnitude_bands = [
        (0, 20), (21, 99), (100, 999), (1000, 9999),
        (10000, 99999), (100000, 999999),
    ]
    per_language = (('en', NUM_TEMPLATES_EN), ('pl', NUM_TEMPLATES_PL))

    samples = []
    for _ in range(count):
        low, high = random.choice(magnitude_bands)
        value = random.randint(low, high)
        for tag, templates in per_language:
            sentence = random.choice(templates)
            samples.append(make(
                f"[{tag}] {sentence.format(value)}",
                sentence.format(n2w(value, tag)),
            ))
    return samples
def gen_negatives_decimals(count=80):
    """Build examples for negative integers and for decimal fractions.

    Each of the *count* iterations emits four examples: a negative
    temperature in English and Polish, then a decimal measurement in English
    and Polish. The fractional part is read digit by digit after "point"
    (English) or "przecinek" (Polish).
    """
    def spell_decimal(whole, frac_digits, lang, separator):
        # "12.50" -> "<twelve> <separator> <five> <zero>"
        digits = ' '.join(n2w(int(d), lang) for d in frac_digits)
        return f"{n2w(whole, lang)} {separator} {digits}"

    examples = []
    for _ in range(count):
        # Negative integers
        n = -random.randint(1, 500)
        examples.append(make(
            f"[en] The temperature is {n} degrees.",
            f"The temperature is {n2w(n, 'en')} degrees."
        ))
        examples.append(make(
            f"[pl] Temperatura wynosi {n} stopni.",
            f"Temperatura wynosi {n2w(n, 'pl')} stopni."
        ))

        # Decimals. The fraction stays a string of digits; the float
        # conversion that used to be computed here was never read.
        whole = random.randint(0, 999)
        frac = str(random.randint(1, 99))
        dec_str = f"{whole}.{frac}"
        en_spoken = spell_decimal(whole, frac, 'en', 'point')
        pl_spoken = spell_decimal(whole, frac, 'pl', 'przecinek')
        examples.append(make(
            f"[en] The measurement reads {dec_str}.",
            f"The measurement reads {en_spoken}."
        ))
        examples.append(make(
            f"[pl] Pomiar wskazuje {dec_str}.",
            f"Pomiar wskazuje {pl_spoken}."
        ))
    return examples
def gen_ordinals(count=80):
    """Build ordinal-number examples for 1..100 in English and Polish.

    The English input carries the usual st/nd/rd/th suffix; the Polish input
    writes the ordinal as the number followed by a period.
    """
    # NOTE(review): the Polish target places the ordinal before the feminine
    # noun "próba" — confirm that num2words' Polish ordinals agree in gender.
    special_suffixes = {1: 'st', 2: 'nd', 3: 'rd'}

    out = []
    for _ in range(count):
        number = random.randint(1, 100)
        # 11th, 12th and 13th take "th" despite ending in 1, 2 or 3.
        if 11 <= number % 100 <= 13:
            suffix = 'th'
        else:
            suffix = special_suffixes.get(number % 10, 'th')
        english = make(
            f"[en] This is the {number}{suffix} attempt.",
            f"This is the {n2w_ordinal(number, 'en')} attempt.",
        )
        polish = make(
            f"[pl] To jest {number}. próba.",
            f"To jest {n2w_ordinal(number, 'pl')} próba.",
        )
        out.extend((english, polish))
    return out
def gen_percentages(count=60):
    """Build percentage examples, mixing whole and one-decimal values.

    Returns 2 * count examples (English and Polish for every drawn value).
    """
    def in_words(value, lang, separator):
        if not isinstance(value, float):
            return n2w(value, lang)
        # One-decimal float: integer part as a number, decimals digit by digit.
        decimals = str(value).split('.')[1]
        spoken_decimals = ' '.join(n2w(int(ch), lang) for ch in decimals)
        return f"{n2w(int(value), lang)} {separator} {spoken_decimals}"

    out = []
    for _ in range(count):
        # Both candidates are always drawn; one of them is then kept.
        whole_candidate = random.randint(0, 100)
        decimal_candidate = round(random.uniform(0, 100), 1)
        n = random.choice([whole_candidate, decimal_candidate])

        english = in_words(n, 'en', 'point')
        polish = in_words(n, 'pl', 'przecinek')
        out.append(make(
            f"[en] The success rate is {n}%.",
            f"The success rate is {english} percent.",
        ))
        out.append(make(
            f"[pl] Wskaźnik sukcesu wynosi {n}%.",
            f"Wskaźnik sukcesu wynosi {polish} procent.",
        ))
    return out
def gen_dates(count=100):
    """Build date examples in several numeric input formats.

    English inputs use one of MM/DD/YYYY, DD.MM.YYYY or YYYY-MM-DD and are
    spoken as "<Month> <ordinal day>, <year>". Polish inputs always use
    DD.MM.YYYY and are spoken as "<ordinal day> <month, genitive> <year>".
    Days are limited to 1-28 so every day/month combination is a real date.

    Returns 2 * count examples.
    """
    examples = []
    months_en = ['January', 'February', 'March', 'April', 'May', 'June',
                 'July', 'August', 'September', 'October', 'November', 'December']
    months_pl_gen = ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
                     'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia']
    for _ in range(count):
        day = random.randint(1, 28)
        month = random.randint(1, 12)
        year = random.randint(1990, 2030)

        # EN: "05/25/2026" or "25.05.2026" or "2026-05-25"
        en_day_ord = n2w_ordinal(day, 'en')
        # Read the year the way years are spoken ("twenty twenty-six"), as the
        # hand-written mixed examples in this file do, instead of as a plain
        # cardinal number.
        en_year = num2words(year, to='year', lang='en')
        en_spoken = f"{months_en[month-1]} {en_day_ord}, {en_year}"
        fmt = random.choice(['us', 'eu', 'iso'])
        if fmt == 'us':
            raw_date = f"{month:02d}/{day:02d}/{year}"
        elif fmt == 'eu':
            raw_date = f"{day:02d}.{month:02d}.{year}"
        else:
            raw_date = f"{year}-{month:02d}-{day:02d}"
        examples.append(make(
            f"[en] The deadline is {raw_date}.",
            f"The deadline is {en_spoken}."
        ))

        # PL: always "25.05.2026"
        pl_day_ord = n2w_ordinal(day, 'pl')
        pl_year = n2w(year, 'pl')
        pl_spoken = f"{pl_day_ord} {months_pl_gen[month-1]} {pl_year}"
        raw_date_pl = f"{day:02d}.{month:02d}.{year}"
        examples.append(make(
            f"[pl] Termin to {raw_date_pl}.",
            f"Termin to {pl_spoken}."
        ))
    return examples
def gen_times(count=80):
    """Build clock-time examples in 24-hour and 12-hour notation.

    Each of the *count* iterations emits three examples for one random time:
    24-hour English, 24-hour Polish and 12-hour English with AM/PM.

    Spoken forms:
      * minutes 10-59 are read as a number ("fourteen thirty");
      * minutes 01-09 keep their leading zero ("fourteen oh five",
        "czternaście zero pięć") so they cannot be confused with other times;
      * a full hour in 24-hour English is "<hour> hundred", 00:00 is
        "midnight"; in the other two forms the minutes are simply omitted.
    """
    def spoken_minutes(minute, lang, zero_word):
        # Empty for a full hour; otherwise the minutes, with the zero spoken.
        if minute == 0:
            return ""
        words = n2w(minute, lang)
        return f"{zero_word} {words}" if minute < 10 else words

    def join_words(*parts):
        # Skip empty parts so that no doubled spaces end up in the target.
        return ' '.join(part for part in parts if part)

    examples = []
    for _ in range(count):
        h24 = random.randint(0, 23)
        m = random.randint(0, 59)

        # 24h format
        time_str = f"{h24:02d}:{m:02d}"
        en_h = n2w(h24, 'en')
        en_m = spoken_minutes(m, 'en', 'oh')
        if m != 0:
            en_spoken = join_words(en_h, en_m)
        elif h24 > 0:
            en_spoken = f"{en_h} hundred"
        else:
            en_spoken = "midnight"
        pl_spoken = join_words(n2w(h24, 'pl'), spoken_minutes(m, 'pl', 'zero'))
        examples.append(make(
            f"[en] The meeting is at {time_str}.",
            f"The meeting is at {en_spoken}."
        ))
        examples.append(make(
            f"[pl] Spotkanie jest o {time_str}.",
            f"Spotkanie jest o {pl_spoken}."
        ))

        # 12h format (EN only common)
        h12 = h24 % 12 or 12
        ampm = "AM" if h24 < 12 else "PM"
        time_12 = f"{h12}:{m:02d} {ampm}"
        en_12_spoken = join_words(n2w(h12, 'en'), en_m, ampm)
        examples.append(make(
            f"[en] Alarm set for {time_12}.",
            f"Alarm set for {en_12_spoken}."
        ))
    return examples
def _pl_plural_form(n, one, few, many):
    """Pick the Polish noun form that agrees with the integer *n*."""
    magnitude = abs(n)
    if magnitude == 1:
        return one
    if 2 <= magnitude % 10 <= 4 and not 12 <= magnitude % 100 <= 14:
        return few
    return many


def gen_currency(count=120):
    """Build money-amount examples: any currency symbol in either language.

    The first part emits 2 * count whole-amount examples (English and Polish
    for each drawn amount). The second part emits 40 examples with a
    fractional amount, spoken as "<whole> <currency> and <minor units>".

    The minor unit is named after the currency (cents, groszy, pence) and
    agrees in number with the amount, in both languages.
    """
    examples = []
    symbols = ['$', '€', 'zł', '£']
    # Written before the number; every other symbol follows it after a space.
    prefixed = {'$', '€', '£'}
    # English minor units: (singular, plural).
    minor_en = {
        '$': ('cent', 'cents'),
        '€': ('cent', 'cents'),
        'zł': ('grosz', 'groszy'),
        '£': ('penny', 'pence'),
    }
    # Polish minor units: (with 1, with 2-4, with the rest).
    minor_pl = {
        '$': ('cent', 'centy', 'centów'),
        '€': ('cent', 'centy', 'centów'),
        'zł': ('grosz', 'grosze', 'groszy'),
        '£': ('pens', 'pensy', 'pensów'),
    }

    for _ in range(count):
        sym = random.choice(symbols)
        # One candidate per magnitude band is drawn, then one is kept.
        n = random.choice([
            random.randint(1, 99),
            random.randint(100, 9999),
            random.randint(10000, 999999),
        ])
        if sym in prefixed:
            raw_amount = f"{sym}{n:,}"
        else:
            raw_amount = f"{n:,} {sym}"
        # English
        tpl = random.choice(CURRENCY_TEMPLATES_EN)
        examples.append(make(
            f"[en] {tpl.format(raw_amount)}",
            tpl.format(fmt_currency(n, sym, 'en'))
        ))
        # Polish
        tpl = random.choice(CURRENCY_TEMPLATES_PL)
        examples.append(make(
            f"[pl] {tpl.format(raw_amount)}",
            tpl.format(fmt_currency(n, sym, 'pl'))
        ))

    # Amounts with a fractional part
    for _ in range(20):
        whole = random.randint(1, 999)
        cents = random.randint(1, 99)
        sym = random.choice(symbols)
        if sym in prefixed:
            raw = f"{sym}{whole}.{cents:02d}"
        else:
            raw = f"{whole}.{cents:02d} {sym}"
        en_single, en_plural = minor_en[sym]
        en_minor = en_single if cents == 1 else en_plural
        pl_minor = _pl_plural_form(cents, *minor_pl[sym])
        en_spoken = f"{n2w(whole, 'en')} {en_currency_name(sym, whole)} and {n2w(cents, 'en')} {en_minor}"
        pl_spoken = f"{n2w(whole, 'pl')} {pl_currency_name(sym, whole)} i {n2w(cents, 'pl')} {pl_minor}"
        examples.append(make(f"[en] Total: {raw}", f"Total: {en_spoken}"))
        examples.append(make(f"[pl] Łącznie: {raw}", f"Łącznie: {pl_spoken}"))
    return examples
def gen_units(count=80):
    """Build examples that expand a number followed by a unit abbreviation.

    Returns 2 * count examples (English and Polish for every drawn quantity).
    """
    # abbreviation -> (English name, Polish name)
    # NOTE(review): both names are fixed plural forms and are used for every
    # quantity, including a quantity of 1 — confirm this is acceptable.
    unit_names = {
        'km': ('kilometers', 'kilometrów'),
        'm': ('meters', 'metrów'),
        'cm': ('centimeters', 'centymetrów'),
        'mm': ('millimeters', 'milimetrów'),
        'kg': ('kilograms', 'kilogramów'),
        'g': ('grams', 'gramów'),
        'mg': ('milligrams', 'miligramów'),
        'l': ('liters', 'litrów'),
        'ml': ('milliliters', 'mililitrów'),
        'km/h': ('kilometers per hour', 'kilometrów na godzinę'),
        'mph': ('miles per hour', 'mil na godzinę'),
        'GB': ('gigabytes', 'gigabajtów'),
        'MB': ('megabytes', 'megabajtów'),
        'TB': ('terabytes', 'terabajtów'),
        'GHz': ('gigahertz', 'gigaherców'),
        'MHz': ('megahertz', 'megaherców'),
        'kW': ('kilowatts', 'kilowatów'),
        'W': ('watts', 'watów'),
        'V': ('volts', 'woltów'),
        'A': ('amperes', 'amperów'),
    }
    sentences_en = [
        "The speed is {} {}.",
        "It weighs {} {}.",
        "The capacity is {} {}.",
        "Measured {} {}.",
        "Maximum: {} {}.",
    ]
    sentences_pl = [
        "Prędkość wynosi {} {}.",
        "Waży {} {}.",
        "Pojemność to {} {}.",
        "Zmierzono {} {}.",
        "Maksimum: {} {}.",
    ]
    choices = list(unit_names.items())

    def in_words(value, lang, separator):
        if not isinstance(value, float):
            return n2w(value, lang)
        # One-decimal float: integer part as a number, decimals digit by digit.
        decimals = str(value).split('.')[1]
        spoken_decimals = ' '.join(n2w(int(ch), lang) for ch in decimals)
        return f"{n2w(int(value), lang)} {separator} {spoken_decimals}"

    out = []
    for _ in range(count):
        abbreviation, (name_en, name_pl) = random.choice(choices)
        # All three candidates are always drawn; one of them is then kept.
        small = random.randint(1, 99)
        large = random.randint(100, 9999)
        fractional = round(random.uniform(0.1, 99.9), 1)
        quantity = random.choice([small, large, fractional])
        shown = str(quantity)

        spoken_en = in_words(quantity, 'en', 'point')
        spoken_pl = in_words(quantity, 'pl', 'przecinek')

        sentence = random.choice(sentences_en)
        out.append(make(
            f"[en] {sentence.format(shown, abbreviation)}",
            sentence.format(spoken_en, name_en),
        ))
        sentence = random.choice(sentences_pl)
        out.append(make(
            f"[pl] {sentence.format(shown, abbreviation)}",
            sentence.format(spoken_pl, name_pl),
        ))
    return out
def gen_temperatures(count=40):
    """Build temperature examples: Celsius in both languages, Fahrenheit in English.

    Returns 3 * count examples.
    """
    out = []
    for _ in range(count):
        celsius = random.randint(-30, 45)
        out.extend((
            make(
                f"[en] Current temperature: {celsius}°C.",
                f"Current temperature: {n2w(celsius, 'en')} degrees Celsius.",
            ),
            make(
                f"[pl] Aktualna temperatura: {celsius}°C.",
                f"Aktualna temperatura: {n2w(celsius, 'pl')} stopni Celsjusza.",
            ),
        ))
        # The Fahrenheit reading is drawn independently of the Celsius one.
        fahrenheit = random.randint(0, 110)
        out.append(make(
            f"[en] It's {fahrenheit}°F outside.",
            f"It's {n2w(fahrenheit, 'en')} degrees Fahrenheit outside.",
        ))
    return out
def gen_acronyms(count=150):
    """Build acronym examples: spelled out, kept as a word, or plain all-caps.

    Each of the *count* iterations emits five examples:
      * a letter-acronym expanded to spaced letters (English and Polish),
      * a word-acronym that must stay untouched (English and Polish),
      * an ordinary all-caps word that must stay untouched (English only).
    """
    acronym_pairs = list(SPELLED_ACRONYMS.items())
    per_language = (('en', ACRONYM_TEMPLATES_EN), ('pl', ACRONYM_TEMPLATES_PL))

    out = []
    for _ in range(count):
        # Letter-acronym inside a carrier sentence
        short, spaced = random.choice(acronym_pairs)
        for tag, templates in per_language:
            sentence = random.choice(templates)
            out.append(make(
                f"[{tag}] {sentence.format(short)}",
                sentence.format(spaced),
            ))

        # Pronounceable acronym: target is identical to the input
        word = random.choice(WORD_ACRONYMS)
        english = f"The {word} project launched."
        polish = f"Projekt {word} wystartował."
        out.append(make(f"[en] {english}", english))
        out.append(make(f"[pl] {polish}", polish))

        # Ordinary word in capitals: preserved exactly as written
        shouted = random.choice(NOT_ACRONYMS)
        line = f"{shouted}: read the instructions."
        out.append(make(f"[en] {line}", line))
    return out
def gen_nato(count=100):
    """Build NATO phonetic alphabet examples in both directions.

    Each of the *count* iterations emits five examples: decoding code words
    to spaced letters (English and Polish), encoding letters to code words
    (English and Polish), and one more English decoding example that uses a
    separately drawn set of letters.
    """
    alphabet = list(NATO.keys())

    out = []
    for _ in range(count):
        # random.sample draws without replacement: no letter repeats.
        size = random.randint(2, 6)
        picked = random.sample(alphabet, size)
        code_words = ' '.join(NATO[letter] for letter in picked)
        compact = ''.join(picked)
        spaced = ' '.join(picked)

        # Decode: code words -> spaced letters
        out.append(make(
            f"[en] Callsign: {code_words}.",
            f"Callsign: {spaced}.",
        ))
        out.append(make(
            f"[pl] Znak wywoławczy: {code_words}.",
            f"Znak wywoławczy: {spaced}.",
        ))
        # Encode: compact letters -> code words
        out.append(make(
            f"[en] Spell {compact} using NATO phonetic.",
            f"{code_words}.",
        ))
        out.append(make(
            f"[pl] Przeliteruj {compact} alfabetem NATO.",
            f"{code_words}.",
        ))

        # Decode in running text, with a fresh draw of 3-5 letters
        extra_size = random.randint(3, 5)
        extra = random.sample(alphabet, extra_size)
        extra_code_words = ' '.join(NATO[letter] for letter in extra)
        extra_spaced = ' '.join(extra)
        out.append(make(
            f"[en] The code is {extra_code_words}.",
            f"The code is {extra_spaced}.",
        ))
    return out
def gen_spelling_corrections(count=80):
    """Build examples whose target fixes one misspelled word.

    Only dictionary entries that actually change the word are used; identity
    entries in TYPOS_EN / TYPOS_PL are dropped. A language whose list ends up
    empty is skipped. The carrier sentence is picked independently of the
    word, so the two are not matched by part of speech.
    """
    sentences = {
        'en': [
            "The {} was unexpected.",
            "We need to {} the system.",
            "It was {} to everyone.",
            "The {} process failed.",
            "Please {} the document.",
        ],
        'pl': [
            "To było {}.",
            "Trzeba {} system.",
            "Było to {} dla wszystkich.",
            "Proces {} się nie powiódł.",
            "Proszę {} dokument.",
        ],
    }
    real_fixes = {
        'en': [(wrong, right) for wrong, right in TYPOS_EN.items() if wrong != right],
        'pl': [(wrong, right) for wrong, right in TYPOS_PL.items() if wrong != right],
    }

    out = []
    for _ in range(count):
        for tag in ('en', 'pl'):
            candidates = real_fixes[tag]
            if not candidates:
                continue
            wrong, right = random.choice(candidates)
            sentence = random.choice(sentences[tag])
            out.append(make(
                f"[{tag}] {sentence.format(wrong)}",
                sentence.format(right),
            ))
    return out
def _spoken_host(host, tlds):
    """Speak a host name: 'docs.python.org' -> 'docs dot python dot org'."""
    *labels, tld = host.split('.')
    spoken_tld = tlds.get(tld, f'dot {tld}')
    return f"{' dot '.join(labels)} {spoken_tld}"


def _spoken_path(path, lang):
    """Speak a URL path: separators become words, digit runs become numbers.

    '/api/v2' -> 'slash api slash v two' (numbers are spelled in *lang*).
    An empty path gives an empty string.
    """
    spoken = path
    for symbol, word in (('/', ' slash '), ('?', ' question mark '),
                         ('=', ' equals '), ('#', ' hash ')):
        spoken = spoken.replace(symbol, word)
    # Digits must not survive into the target; the surrounding spaces split
    # "v2" into "v two".
    spoken = re.sub(r'\d+', lambda m: f" {n2w(int(m.group()), lang)} ", spoken)
    # Collapse the padding added above into single spaces.
    return ' '.join(spoken.split())


def gen_urls_emails(count=80):
    """Build examples that turn URLs and e-mail addresses into spoken form.

    Each of the *count* iterations emits four examples: one URL in an English
    and a Polish sentence, then one e-mail address in an English and a Polish
    sentence. The "https://" scheme is dropped from the spoken URL. The words
    "dot", "slash" and "at" are used in both languages; numbers inside a path
    are spelled in the tagged language.
    """
    examples = []
    domains = ['github.com', 'google.com', 'example.org', 'openai.com',
               'huggingface.co', 'reddit.com', 'stackoverflow.com',
               'wikipedia.org', 'youtube.com', 'docs.python.org']
    paths = ['/docs', '/api/v2', '/user/settings', '/search?q=test',
             '/releases/latest', '/issues/42', '/wiki/Main_Page']
    tlds = {'com': 'dot com', 'org': 'dot org', 'co': 'dot co',
            'net': 'dot net', 'pl': 'dot P L', 'dev': 'dot dev'}
    users = ['john', 'anna', 'admin', 'support', 'info', 'kontakt']
    email_domains = ['gmail.com', 'outlook.com', 'firma.pl', 'example.org']
    # The empty entry yields a bare-domain URL.
    path_choices = paths + ['']

    for _ in range(count):
        # URL
        domain = random.choice(domains)
        path = random.choice(path_choices)
        url = f"https://{domain}{path}"
        spoken_domain = _spoken_host(domain, tlds)
        en_url = f"{spoken_domain} {_spoken_path(path, 'en')}".strip()
        pl_url = f"{spoken_domain} {_spoken_path(path, 'pl')}".strip()
        examples.append(make(
            f"[en] Visit {url} for details.",
            f"Visit {en_url} for details."
        ))
        examples.append(make(
            f"[pl] Odwiedź {url} po szczegóły.",
            f"Odwiedź {pl_url} po szczegóły."
        ))

        # Email
        user = random.choice(users)
        edom = random.choice(email_domains)
        email = f"{user}@{edom}"
        spoken_email = f"{user} at {_spoken_host(edom, tlds)}"
        examples.append(make(
            f"[en] Contact us at {email}.",
            f"Contact us at {spoken_email}."
        ))
        examples.append(make(
            f"[pl] Napisz do nas na {email}.",
            f"Napisz do nas na {spoken_email}."
        ))
    return examples
def gen_symbols(count=60):
    """Build examples that expand standalone symbols and simple arithmetic.

    The first part emits 2 * count "press <symbol>" examples; the second part
    always emits 40 arithmetic examples (20 expressions in two languages).
    Inside arithmetic, '*' and '/' are read as operators ("times",
    "divided by") rather than by their symbol names.
    """
    names_en = {
        '@': 'at', '&': 'and', '#': 'hash', '%': 'percent',
        '+': 'plus', '=': 'equals', '/': 'slash', '\\': 'backslash',
        '*': 'asterisk', '^': 'caret', '~': 'tilde', '|': 'pipe',
        '<': 'less than', '>': 'greater than',
    }
    names_pl = {
        '@': 'małpa', '&': 'i', '#': 'hash', '%': 'procent',
        '+': 'plus', '=': 'równa się', '/': 'slash', '\\': 'backslash',
        '*': 'gwiazdka', '^': 'daszek', '~': 'tylda', '|': 'kreska pionowa',
        '<': 'mniejsze niż', '>': 'większe niż',
    }
    operators = ['+', '-', '*', '/']
    operator_en = {'+': 'plus', '-': 'minus', '*': 'times', '/': 'divided by'}
    operator_pl = {'+': 'plus', '-': 'minus', '*': 'razy', '/': 'podzielone przez'}
    all_symbols = list(names_en.keys())

    out = []
    for _ in range(count):
        symbol = random.choice(all_symbols)
        out.append(make(
            f"[en] Press {symbol} to continue.",
            f"Press {names_en[symbol]} to continue.",
        ))
        out.append(make(
            f"[pl] Naciśnij {symbol} aby kontynuować.",
            f"Naciśnij {names_pl[symbol]} aby kontynuować.",
        ))

    # Math expressions
    for _ in range(20):
        left = random.randint(1, 100)
        right = random.randint(1, 100)
        operator = random.choice(operators)
        out.append(make(
            f"[en] Calculate {left} {operator} {right}.",
            f"Calculate {n2w(left, 'en')} {operator_en[operator]} {n2w(right, 'en')}.",
        ))
        out.append(make(
            f"[pl] Oblicz {left} {operator} {right}.",
            f"Oblicz {n2w(left, 'pl')} {operator_pl[operator]} {n2w(right, 'pl')}.",
        ))
    return out
def gen_markdown(count=60):
    """Build examples whose target is the input with markdown removed.

    The (markdown, plain) pairs are fixed; each of the *count* iterations
    samples one English and one Polish pair, so repeats are expected.
    """
    pairs_by_language = (
        ('en', [
            ("**important** update", "important update"),
            ("the *quick* brown fox", "the quick brown fox"),
            ("# Main Heading", "Main Heading"),
            ("## Section Two", "Section Two"),
            ("### Subsection", "Subsection"),
            ("Check `config.json` file", "Check config dot json file"),
            ("Use ```python\nprint()``` here", "Use print here"),
            ("Visit [our site](https://example.com)", "Visit our site"),
            ("- item one\n- item two", "item one, item two"),
            ("1. first step\n2. second step", "first step, second step"),
            ("> This is a quote", "This is a quote"),
            ("~~deleted~~ text", "deleted text"),
            ("__underline__ this", "underline this"),
            ("***bold italic*** text", "bold italic text"),
            ("some `inline code` here", "some inline code here"),
        ]),
        ('pl', [
            ("**ważna** aktualizacja", "ważna aktualizacja"),
            ("*szybki* brązowy lis", "szybki brązowy lis"),
            ("# Główny Nagłówek", "Główny Nagłówek"),
            ("## Sekcja Druga", "Sekcja Druga"),
            ("Sprawdź plik `config.json`", "Sprawdź plik config dot json"),
            ("Odwiedź [naszą stronę](https://example.com)", "Odwiedź naszą stronę"),
            ("- element pierwszy\n- element drugi", "element pierwszy, element drugi"),
            ("1. pierwszy krok\n2. drugi krok", "pierwszy krok, drugi krok"),
            ("> To jest cytat", "To jest cytat"),
            ("~~usunięty~~ tekst", "usunięty tekst"),
            ("***pogrubiona kursywa*** tekst", "pogrubiona kursywa tekst"),
        ]),
    )

    out = []
    for _ in range(count):
        for tag, pairs in pairs_by_language:
            marked_up, plain = random.choice(pairs)
            out.append(make(f"[{tag}] {marked_up}", plain))
    return out
def gen_phone_numbers(count=40):
    """Build phone-number examples read out digit by digit, group by group.

    Each of the *count* iterations emits three examples: a "+48 XXX XXX XXX"
    number in a Polish and an English sentence, and a "(XXX) XXX-XXXX" number
    in an English sentence. The country code is read as a number; every other
    group is read one digit at a time, with commas between groups.
    """
    def digit_words(group, lang):
        return ' '.join(n2w(int(digit), lang) for digit in str(group))

    out = []
    for _ in range(count):
        # Polish phone: +48 XXX XXX XXX
        groups = [random.randint(100, 999) for _ in range(3)]
        written = "+48 " + ' '.join(str(group) for group in groups)
        polish = "plus czterdzieści osiem, " + ', '.join(
            digit_words(group, 'pl') for group in groups
        )
        english = "plus forty-eight, " + ', '.join(
            digit_words(group, 'en') for group in groups
        )
        out.append(make(
            f"[pl] Zadzwoń pod {written}.",
            f"Zadzwoń pod {polish}.",
        ))
        out.append(make(
            f"[en] Call {written}.",
            f"Call {english}.",
        ))

        # US phone: (XXX) XXX-XXXX
        area = random.randint(200, 999)
        prefix = random.randint(100, 999)
        line = random.randint(1000, 9999)
        written_us = f"({area}) {prefix}-{line}"
        english_us = ', '.join(
            digit_words(group, 'en') for group in (area, prefix, line)
        )
        out.append(make(
            f"[en] Reach us at {written_us}.",
            f"Reach us at {english_us}.",
        ))
    return out
def gen_versions_ips(count=40):
    """Build examples for version strings and IPv4 addresses.

    Each of the *count* iterations emits four examples: a "vX.Y.Z" version in
    English and Polish, then a dotted IPv4 address in English and Polish.
    Every component is read as a whole number. The separator is "point"
    (versions) or "dot" (addresses) in English and "kropka" in Polish.
    """
    out = []
    for _ in range(count):
        # Version: v3.2.1
        major = random.randint(0, 20)
        minor = random.randint(0, 30)
        patch = random.randint(0, 99)
        components = (major, minor, patch)
        version = "v" + '.'.join(str(part) for part in components)
        english = "version " + ' point '.join(n2w(part, 'en') for part in components)
        polish = "wersja " + ' kropka '.join(n2w(part, 'pl') for part in components)
        out.append(make(
            f"[en] Upgrade to {version}.",
            f"Upgrade to {english}.",
        ))
        out.append(make(
            f"[pl] Zaktualizuj do {version}.",
            f"Zaktualizuj do {polish}.",
        ))

        # IP: 192.168.1.100
        octets = [random.randint(0, 255) for _ in range(4)]
        address = '.'.join(map(str, octets))
        english_ip = ' dot '.join(n2w(octet, 'en') for octet in octets)
        polish_ip = ' kropka '.join(n2w(octet, 'pl') for octet in octets)
        out.append(make(
            f"[en] Connect to {address}.",
            f"Connect to {english_ip}.",
        ))
        out.append(make(
            f"[pl] Połącz się z {address}.",
            f"Połącz się z {polish_ip}.",
        ))
    return out
def gen_passthrough(count=80):
    """Build negative examples: clean sentences whose target equals the input.

    Only the language tag differs between the model input and the target.
    Each of the *count* iterations samples one English and one Polish
    sentence from fixed lists, so repeats are expected.
    """
    sentences_by_language = (
        ('en', [
            "Hello, how are you doing today?",
            "The quick brown fox jumps over the lazy dog.",
            "Please close the door when you leave.",
            "She walked slowly through the garden.",
            "The meeting went well yesterday.",
            "I think we should reconsider the approach.",
            "Thank you for your help with this project.",
            "The weather is beautiful this morning.",
            "Let me know if you need anything else.",
            "We appreciate your continued support.",
            "The results exceeded our expectations.",
            "Can you send me the updated report?",
            "The new design looks great.",
            "I'll follow up with you next week.",
            "The system is running smoothly now.",
        ]),
        ('pl', [
            "Cześć, jak się masz?",
            "Szybki brązowy lis przeskakuje nad leniwym psem.",
            "Proszę zamknąć drzwi, gdy wychodzisz.",
            "Szła powoli przez ogród.",
            "Spotkanie przebiegło dobrze wczoraj.",
            "Myślę, że powinniśmy ponownie rozważyć podejście.",
            "Dziękuję za pomoc przy tym projekcie.",
            "Pogoda jest piękna tego ranka.",
            "Daj mi znać, jeśli potrzebujesz czegoś jeszcze.",
            "Doceniamy wasze ciągłe wsparcie.",
            "Wyniki przekroczyły nasze oczekiwania.",
            "Czy możesz wysłać mi zaktualizowany raport?",
            "Nowy projekt wygląda świetnie.",
            "Skontaktuję się z tobą w przyszłym tygodniu.",
            "System działa teraz płynnie.",
        ]),
    )

    out = []
    for _ in range(count):
        for tag, sentences in sentences_by_language:
            sentence = random.choice(sentences)
            out.append(make(f"[{tag}] {sentence}", sentence))
    return out
def gen_mixed(count=150):
    """Build complex examples that need several normalizations at once.

    The hand-written English and Polish pairs are always emitted. The
    remainder up to ``count`` is filled with templated sentences combining a
    spelled-out acronym, a plain number and a currency amount. If ``count``
    is smaller than the number of hand-written pairs, no templated sentences
    are added.

    Args:
        count: Target number of examples (hand-written plus templated).

    Returns:
        A list of examples built by ``make``.
    """
    mixed_en = [
        (
            "[en] The CPU usage hit 95% at 14:30, costing us $4,500 in SLA penalties.",
            "The C P U usage hit ninety-five percent at fourteen thirty, costing us four thousand five hundred dollars in S L A penalties."
        ),
        (
            "[en] **WARNING**: Server at 192.168.1.42 returned HTTP 503 error.",
            "WARNING: Server at one hundred ninety-two dot one hundred sixty-eight dot one dot forty-two returned H T T P five hundred three error."
        ),
        (
            "[en] Send the PDF to john@example.com by 05/30/2026.",
            "Send the P D F to john at example dot com by May thirtieth, twenty twenty-six."
        ),
        (
            "[en] The NASA rover traveled 3.7km at 0.5km/h on Mars.",
            "The NASA rover traveled three point seven kilometers at zero point five kilometers per hour on Mars."
        ),
        (
            "[en] Update to v2.4.1 — fixes #347 & improves GPU performance by 12%.",
            "Update to version two point four point one — fixes number three hundred forty-seven and improves G P U performance by twelve percent."
        ),
        (
            "[en] The FBI & CIA issued a joint FAQ about the VPN breach affecting 10,000+ users.",
            "The F B I and C I A issued a joint F A Q about the V P N breach affecting ten thousand plus users."
        ),
        (
            "[en] Meeting at 3:00 PM in room 401. Budget: €250,000. Contact HR ASAP.",
            "Meeting at three PM in room four hundred one. Budget: two hundred fifty thousand euros. Contact H R A S A P."
        ),
        (
            "[en] The LED display shows -5°C. The ATM is 200m away on 3rd street.",
            "The L E D display shows minus five degrees Celsius. The A T M is two hundred meters away on third street."
        ),
        (
            "[en] Check https://docs.python.org/api/v3 for the SDK documentation.",
            "Check docs dot python dot org slash api slash v three for the S D K documentation."
        ),
        (
            "[en] Flight BA247 departs at 08:15 from gate 12B. ETA: 11:30.",
            "Flight B A two forty-seven departs at eight fifteen from gate twelve B. E T A: eleven thirty."
        ),
        (
            "[en] The IoT device uses 2.4GHz WiFi & draws 5W at 12V DC.",
            "The I o T device uses two point four gigahertz WiFi and draws five watts at twelve volts D C."
        ),
        (
            "[en] She recieved the USB drive with 256GB of ML training data.",
            "She received the U S B drive with two hundred fifty-six gigabytes of M L training data."
        ),
        (
            "[en] Callsign: Tango Alpha Foxtrot. Coordinates: 52°N, 21°E.",
            "Callsign: T A F. Coordinates: fifty-two degrees north, twenty-one degrees east."
        ),
    ]
    mixed_pl = [
        (
            "[pl] Użycie CPU osiągnęło 95% o 14:30, kosztując nas $4,500 kar SLA.",
            "Użycie C P U osiągnęło dziewięćdziesiąt pięć procent o czternasta trzydzieści, kosztując nas cztery tysiące pięćset dolarów kar S L A."
        ),
        (
            "[pl] **UWAGA**: Serwer 192.168.1.42 zwrócił błąd HTTP 503.",
            "UWAGA: Serwer sto dziewięćdziesiąt dwa kropka sto sześćdziesiąt osiem kropka jeden kropka czterdzieści dwa zwrócił błąd H T T P pięćset trzy."
        ),
        (
            "[pl] Wyślij PDF na kontakt@firma.pl do 30.05.2026.",
            "Wyślij P D F na kontakt małpa firma dot P L do trzydziestego maja dwa tysiące dwudziestego szóstego."
        ),
        (
            "[pl] Łazik NASA przejechał 3,7km z prędkością 0,5km/h na Marsie.",
            "Łazik NASA przejechał trzy przecinek siedem kilometrów z prędkością zero przecinek pięć kilometrów na godzinę na Marsie."
        ),
        (
            "[pl] Aktualizacja do v2.4.1 — naprawia #347 & poprawia wydajność GPU o 12%.",
            "Aktualizacja do wersja dwa kropka cztery kropka jeden — naprawia numer trzysta czterdzieści siedem i poprawia wydajność G P U o dwanaście procent."
        ),
        (
            "[pl] ABW i CBA wydały wspólne FAQ o naruszeniu VPN dotyczącym 10000+ użytkowników.",
            "A B W i C B A wydały wspólne F A Q o naruszeniu V P N dotyczącym dziesięć tysięcy plus użytkowników."
        ),
        (
            "[pl] Spotkanie o 15:00 w pokoju 401. Budżet: €250000. Skontaktuj się z HR jak najszybciej.",
            "Spotkanie o piętnaście zero zero w pokoju czterysta jeden. Budżet: dwieście pięćdziesiąt tysięcy euro. Skontaktuj się z H R jak najszybciej."
        ),
        (
            "[pl] Wyświetlacz LED pokazuje -5°C. Bankomat jest 200m dalej na 3. ulicy.",
            "Wyświetlacz L E D pokazuje minus pięć stopni Celsjusza. Bankomat jest dwieście metrów dalej na trzeciej ulicy."
        ),
        (
            "[pl] Sprawdź https://docs.python.org/api/v3 po dokumentację SDK.",
            "Sprawdź docs dot python dot org slash api slash v trzy po dokumentację S D K."
        ),
        (
            "[pl] Lot BA247 odlatuje o 08:15 z bramki 12B. Przewidywany czas: 11:30.",
            "Lot B A dwieście czterdzieści siedem odlatuje o osiem piętnaście z bramki dwanaście B. Przewidywany czas: jedenaście trzydzieści."
        ),
        (
            "[pl] Urządzenie IoT używa WiFi 2,4GHz i pobiera 5W przy 12V DC.",
            "Urządzenie I o T używa WiFi dwa przecinek cztery gigaherców i pobiera pięć watów przy dwanaście woltów D C."
        ),
        (
            "[pl] Kupiła pendrive USB z 256GB danych treningowych ML za 149 zł.",
            "Kupiła pendrive U S B z dwieście pięćdziesiąt sześć gigabajtów danych treningowych M L za sto czterdzieści dziewięć złotych."
        ),
        (
            "[pl] Znak wywoławczy: Tango Alfa Foxtrot. Współrzędne: 52°N, 21°E.",
            "Znak wywoławczy: T A F. Współrzędne: pięćdziesiąt dwa stopnie północ, dwadzieścia jeden stopni wschód."
        ),
    ]
    # Curated pairs first: English block, then Polish block.
    examples = [make(prompt, target) for prompt, target in mixed_en + mixed_pl]

    # Templated filler. The symbol pool previously held two empty strings,
    # which produced inputs with no currency marker at all and left the
    # "amount then symbol" branch unreachable. The pool now uses the symbols
    # that the curated examples above already contain.
    # NOTE(review): assumes fmt_currency accepts '€' and 'zł' — confirm
    # against its symbol table.
    currency_symbols = ['$', '€', 'zł', '£']
    prefix_symbols = {'$', '€', '£'}  # written before the amount; 'zł' trails it
    acronyms = list(SPELLED_ACRONYMS.keys())

    for _ in range(count - len(mixed_en) - len(mixed_pl)):
        # The order of the random draws is unchanged so the seeded stream
        # stays aligned with earlier runs.
        n = random.randint(100, 9999)
        acr = random.choice(acronyms)
        acr_sp = SPELLED_ACRONYMS[acr]
        sym = random.choice(currency_symbols)
        amount = random.randint(10, 99999)
        if sym in prefix_symbols:
            cur_raw = f"{sym}{amount:,}"
        else:
            cur_raw = f"{amount:,} {sym}"
        if random.random() < 0.5:
            examples.append(make(
                f"[en] The {acr} report shows {n} entries totaling {cur_raw}.",
                f"The {acr_sp} report shows {n2w(n, 'en')} entries totaling {fmt_currency(amount, sym, 'en')}."
            ))
        else:
            examples.append(make(
                f"[pl] Raport {acr} pokazuje {n} wpisów na łącznie {cur_raw}.",
                f"Raport {acr_sp} pokazuje {n2w(n, 'pl')} wpisów na łącznie {fmt_currency(amount, sym, 'pl')}."
            ))
    return examples
def gen_abbreviations(count=40):
    """Build examples that expand common English and Polish abbreviations.

    Each iteration picks one random English and one random Polish
    abbreviation and drops it into a fixed sentence frame, so ``2 * count``
    examples are returned.

    Args:
        count: Number of English/Polish pairs to emit.

    Returns:
        A list of examples built by ``make``.
    """
    # NOTE(review): the single sentence frame per language only reads
    # naturally for personal titles (Mr., Dr., prof., ...); entries such as
    # 'etc.' or 'ul.' yield odd sentences — confirm this is intended.
    en_expansions = {
        'Mr.': 'Mister', 'Mrs.': 'Misses', 'Dr.': 'Doctor',
        'St.': 'Saint', 'Ave.': 'Avenue', 'Blvd.': 'Boulevard',
        'Jr.': 'Junior', 'Sr.': 'Senior', 'Prof.': 'Professor',
        'vs.': 'versus', 'etc.': 'et cetera', 'e.g.': 'for example',
        'i.e.': 'that is', 'approx.': 'approximately',
        'dept.': 'department', 'govt.': 'government',
        'inc.': 'incorporated', 'corp.': 'corporation',
    }
    pl_expansions = {
        'dr': 'doktor', 'prof.': 'profesor', 'mgr': 'magister',
        'inż.': 'inżynier', 'ul.': 'ulica', 'al.': 'aleja',
        'pl.': 'plac', 'os.': 'osiedle', 'nr': 'numer',
        'tel.': 'telefon', 'godz.': 'godzina', 'ok.': 'około',
        'np.': 'na przykład', 'tj.': 'to jest', 'itd.': 'i tak dalej',
        'itp.': 'i tym podobne', 'wg': 'według', 'dot.': 'dotyczący',
    }
    # Materialise the (abbreviation, expansion) pairs once instead of on
    # every iteration.
    en_pairs = list(en_expansions.items())
    pl_pairs = list(pl_expansions.items())

    examples = []
    for _ in range(count):
        # English draw precedes the Polish one, as the seeded stream expects.
        short, spoken = random.choice(en_pairs)
        en_prompt = f"[en] {short} Smith will arrive at 5."
        en_target = f"{spoken} Smith will arrive at five."
        examples.append(make(en_prompt, en_target))

        short, spoken = random.choice(pl_pairs)
        pl_prompt = f"[pl] {short} Kowalski przyjedzie o 5."
        pl_target = f"{spoken} Kowalski przyjedzie o piąta."
        examples.append(make(pl_prompt, pl_target))
    return examples
def gen_contractions(count=40):
    """Build English-only examples that expand contractions into full words.

    Each iteration picks a random contraction and a random sentence frame;
    the prompt uses the contraction and the target uses its expansion.

    Args:
        count: Number of examples to emit.

    Returns:
        A list of ``count`` examples built by ``make``.
    """
    # NOTE(review): contraction and frame are drawn independently, so some
    # combinations (e.g. "don't" + "{} the right approach.") are not
    # well-formed sentences — confirm this is acceptable for training.
    expansions = {
        "don't": "do not", "doesn't": "does not", "didn't": "did not",
        "won't": "will not", "wouldn't": "would not", "couldn't": "could not",
        "shouldn't": "should not", "can't": "cannot", "isn't": "is not",
        "aren't": "are not", "wasn't": "was not", "weren't": "were not",
        "hasn't": "has not", "haven't": "have not", "hadn't": "had not",
        "I'm": "I am", "I've": "I have", "I'll": "I will", "I'd": "I would",
        "you're": "you are", "you've": "you have", "you'll": "you will",
        "he's": "he is", "she's": "she is", "it's": "it is",
        "we're": "we are", "we've": "we have", "we'll": "we will",
        "they're": "they are", "they've": "they have", "they'll": "they will",
        "that's": "that is", "there's": "there is", "here's": "here is",
        "let's": "let us", "who's": "who is", "what's": "what is",
    }
    frames = [
        "{} the right approach.",
        "I think {} a good idea.",
        "{} going to work out.",
        "They said {} ready yet.",
        "{} what we expected.",
    ]
    pairs = list(expansions.items())  # built once, reused for every draw

    examples = []
    for _ in range(count):
        # Contraction first, frame second: same draw order as before.
        short, full = random.choice(pairs)
        frame = random.choice(frames)
        prompt = f"[en] {frame.format(short)}"
        target = frame.format(full)
        examples.append(make(prompt, target))
    return examples
def _pl_magnitude_word(whole, frac, forms):
    """Return the Polish magnitude word that agrees with the spoken number.

    Args:
        whole: Integer part of the number.
        frac: Single fractional digit, or 0 when the number is an integer.
        forms: Tuple ``(singular, nominative plural, genitive plural,
            genitive singular)``, e.g. ``('milion', 'miliony', 'milionów',
            'miliona')``.

    Returns:
        The form matching Polish numeral agreement: fractions take the
        genitive singular, 1 the singular, numbers ending in 2-4 (except
        12-14) the nominative plural, everything else the genitive plural.
    """
    singular, nom_plural, gen_plural, gen_singular = forms
    if frac:
        return gen_singular
    if whole == 1:
        return singular
    if whole % 10 in (2, 3, 4) and whole % 100 not in (12, 13, 14):
        return nom_plural
    return gen_plural


def gen_large_numbers_shorthand(count=40):
    """Build currency examples using magnitude shorthands: 4.5M, 2.3B, 1.2K etc.

    For each of the ``count`` random amounts, one English and one Polish
    example is emitted per currency symbol.

    Args:
        count: Number of random amounts to generate.

    Returns:
        A list of examples built by ``make``.
    """
    # suffix -> (English word, Polish forms as expected by _pl_magnitude_word)
    suffixes = {
        'K': ('thousand', ('tysiąc', 'tysiące', 'tysięcy', 'tysiąca')),
        'M': ('million', ('milion', 'miliony', 'milionów', 'miliona')),
        'B': ('billion', ('miliard', 'miliardy', 'miliardów', 'miliarda')),
    }
    suffix_items = list(suffixes.items())
    # The symbol pool previously contained empty strings, which emitted
    # duplicate, symbol-less inputs and left the trailing-symbol branch dead.
    # NOTE(review): assumes en_currency_name / pl_currency_name accept '€'
    # and 'zł' — confirm against their symbol tables.
    currency_symbols = ['$', '€', 'zł']
    prefix_symbols = {'$', '€'}  # written before the amount; 'zł' trails it

    examples = []
    for _ in range(count):
        # Draw order (suffix, whole, fraction) matches the original so the
        # seeded random stream is consumed identically.
        suffix, (en_word, pl_forms) = random.choice(suffix_items)
        whole = random.randint(1, 99)
        frac = random.choice([0, random.randint(1, 9)])
        # Previously always the genitive plural ("jeden milionów"); now the
        # word agrees with the number.
        pl_word = _pl_magnitude_word(whole, frac, pl_forms)
        if frac:
            raw = f"{whole}.{frac}{suffix}"
            en_spoken = f"{n2w(whole, 'en')} point {n2w(frac, 'en')} {en_word}"
            pl_spoken = f"{n2w(whole, 'pl')} przecinek {n2w(frac, 'pl')} {pl_word}"
        else:
            raw = f"{whole}{suffix}"
            en_spoken = f"{n2w(whole, 'en')} {en_word}"
            pl_spoken = f"{n2w(whole, 'pl')} {pl_word}"
        for sym in currency_symbols:
            cur_raw = f"{sym}{raw}" if sym in prefix_symbols else f"{raw} {sym}"
            # After a magnitude word the currency name is always plural in
            # English and genitive plural in Polish, hence the fixed 2 and 5.
            examples.append(make(
                f"[en] Revenue reached {cur_raw}.",
                f"Revenue reached {en_spoken} {en_currency_name(sym, 2)}."
            ))
            examples.append(make(
                f"[pl] Przychody osiągnęły {cur_raw}.",
                f"Przychody osiągnęły {pl_spoken} {pl_currency_name(sym, 5)}."
            ))
    return examples
# ──────────────────────────────────────────────────────────────
# MAIN
# ──────────────────────────────────────────────────────────────
def main():
    """Generate the full dataset and write it to ``OUTPUT`` as JSONL.

    Seeds the ``random`` module with ``SEED``, runs every category generator
    in a fixed order, prints a per-category count, shuffles the combined
    list, and writes one JSON object per line.
    """
    random.seed(SEED)

    # (label, generator, requested count). The order matters: every
    # generator draws from the shared seeded RNG.
    generators = (
        ("numbers", gen_numbers, 200),
        ("negatives_decimals", gen_negatives_decimals, 80),
        ("ordinals", gen_ordinals, 80),
        ("percentages", gen_percentages, 60),
        ("dates", gen_dates, 100),
        ("times", gen_times, 80),
        ("currency", gen_currency, 120),
        ("units", gen_units, 80),
        ("temperatures", gen_temperatures, 40),
        ("acronyms", gen_acronyms, 150),
        ("nato", gen_nato, 100),
        ("spelling", gen_spelling_corrections, 80),
        ("urls_emails", gen_urls_emails, 80),
        ("symbols", gen_symbols, 60),
        ("markdown", gen_markdown, 60),
        ("phone_numbers", gen_phone_numbers, 40),
        ("versions_ips", gen_versions_ips, 40),
        ("passthrough", gen_passthrough, 80),
        ("mixed", gen_mixed, 150),
        ("abbreviations", gen_abbreviations, 40),
        ("contractions", gen_contractions, 40),
        ("large_numbers", gen_large_numbers_shorthand, 40),
    )

    all_examples = []
    for name, gen_fn, count in generators:
        batch = gen_fn(count)
        print(f" {name}: {len(batch)} examples")
        all_examples += batch

    random.shuffle(all_examples)

    # ensure_ascii=False keeps Polish diacritics readable in the output file.
    with OUTPUT.open("w", encoding="utf-8") as f:
        f.writelines(
            json.dumps(ex, ensure_ascii=False) + "\n" for ex in all_examples
        )
    print(f"\nTotal: {len(all_examples)} examples -> {OUTPUT}")


# Run the generator only when this file is executed as a script.
if __name__ == "__main__":
    main()