122e73860b
gen_tts_dataset.py: 4960 synthetic examples, 22 categories (numbers, currencies, dates, times, temperatures, acronyms, NATO phonetic, URLs, markdown, etc). Bilingual EN/PL with explicit [lang] tag prefix. train_tts_norm.py: Unsloth LoRA training for Qwen2.5-7B-Instruct. Rank 16, 3 epochs, packing, max_seq 768. Trained on H100 in 20m38s, final loss 0.091. Adapter: 154MB.
1353 lines
51 KiB
Python
1353 lines
51 KiB
Python
#!/usr/bin/env python3
|
|
"""Generate TTS normalization LoRA training dataset.
|
|
|
|
Language is EXPLICITLY tagged [en] or [pl] by the TTS client — no detection/inference.
|
|
Currency, units, and content are language-independent (Polish text can have dollars,
|
|
English text can have złoty).
|
|
|
|
Output: JSONL in ShareGPT conversation format for Unsloth/Qwen2.5-Instruct.
|
|
"""
|
|
|
|
import json
|
|
import random
|
|
import re
|
|
from pathlib import Path
|
|
from num2words import num2words
|
|
|
|
# Fixed RNG seed — presumably passed to random.seed() by driver code outside this view.
SEED = 42
# Destination file; the module docstring describes the output as ShareGPT-style JSONL.
OUTPUT = Path("tts_norm_dataset.jsonl")

# System prompt that make() attaches as the "system" turn of every example.
# The adjacent literals are concatenated into one string, so every non-final
# fragment carries its own trailing space.
SYSTEM = (
    "You are a TTS text preprocessor. The input begins with a language tag [en] or [pl] "
    "that specifies the target speech language. Normalize the text for natural speech synthesis: "
    "expand numbers to words in the tagged language, handle acronyms (spell out letter-acronyms "
    "with spaces between letters, keep pronounceable acronyms intact), fix spelling errors, "
    "convert symbols and URLs to spoken form, and strip markdown formatting. "
    "Output only the normalized text without the language tag."
)
|
|
|
|
# ──────────────────────────────────────────────────────────────
|
|
# NATO PHONETIC ALPHABET
|
|
# ──────────────────────────────────────────────────────────────
|
|
# Letter -> code word, in alphabetical order ("Alfa" and "Juliett" are spelled
# exactly as the generators emit them).
NATO = dict(zip(
    'ABCDEFGHIJKLMNOPQRSTUVWXYZ',
    (
        'Alfa', 'Bravo', 'Charlie', 'Delta', 'Echo', 'Foxtrot', 'Golf',
        'Hotel', 'India', 'Juliett', 'Kilo', 'Lima', 'Mike', 'November',
        'Oscar', 'Papa', 'Quebec', 'Romeo', 'Sierra', 'Tango', 'Uniform',
        'Victor', 'Whiskey', 'X-ray', 'Yankee', 'Zulu',
    ),
))
|
|
|
|
# ──────────────────────────────────────────────────────────────
|
|
# ACRONYMS — spelled letter-by-letter
|
|
# ──────────────────────────────────────────────────────────────
|
|
# Acronym -> letter-by-letter reading ("API" -> "A P I").
# Every spoken form is simply the acronym's letters separated by spaces, so the
# mapping is derived from the list of acronyms; insertion order is preserved.
SPELLED_ACRONYMS = {
    acronym: ' '.join(acronym)
    for acronym in """
        API CPU GPU RAM
        SSD HTTP HTTPS
        URL HTML CSS
        FBI CIA NFL NBA
        MVP CEO CFO CTO
        AI ML OS UI UX
        IP VPN DNS SSH
        FTP SQL XML JSON
        AWS USB HDMI
        LCD LED PDF FAQ
        DIY ETA FYI ASAP
        GPS ATM BBC CNN
        EU UN UK US
        RGB TCP UDP
        IDE SDK CI CD
        PR QA IT HR
        PKP PZU ZUS NFZ
        PKO TVP TVN
        KRS NIP VAT
        ZTM MPK GUS PGE
        NBP PKB UE ONZ
        RPO NIK SLD
        PIS PSL PCK
        AGH UJ UW PWN
        IPN ABW CBA
        PIT CIT PESEL
        TTS NPC RPG PVP
        DPS MMO RNG AFK
        LLM NLP OCR ORM
    """.split()
}
|
|
|
|
# Acronyms pronounced as words (not spelled)
|
|
# Kept verbatim in the normalized output by gen_acronyms() (never spelled out).
WORD_ACRONYMS = (
    'NASA NATO LASER RADAR SCUBA PIN SIM BIOS '
    'CAPTCHA AWOL GIF JPEG GOPR IKEA FIAT '
    'UNICEF UNESCO AIDS COVID BASIC SWIFT DART '
    'RUST AJAX LIDAR MODEM PIXEL'
).split()
|
|
|
|
# Words that LOOK like acronyms (all-caps) but are just regular words
|
|
# All-caps tokens that gen_acronyms() passes through unchanged.
# Each word appears exactly once: the list is sampled with random.choice(), so a
# repeated entry (previously 'STOP') would be drawn twice as often as the rest.
NOT_ACRONYMS = [
    # English
    'NVIDIA', 'HELLO', 'STOP', 'WARNING', 'ERROR', 'DANGER',
    'IMPORTANT', 'NOTE', 'URGENT', 'ATTENTION', 'WELCOME', 'EXIT',
    'OPEN', 'CLOSE', 'START', 'FINISH',
    # Polish ('STOP' is shared with the English group above)
    'UWAGA', 'WEJŚCIE', 'WYJŚCIE', 'ZAMKNIĘTE', 'OTWARTE',
]
|
|
|
|
# ──────────────────────────────────────────────────────────────
|
|
# SPELLING ERRORS
|
|
# ──────────────────────────────────────────────────────────────
|
|
# English misspelling -> correction pairs, sampled by gen_spelling_corrections().
# NOTE(review): 'judgement' is an accepted British spelling rather than an
# error — confirm that rewriting it to 'judgment' is intended.
TYPOS_EN = {
    'recieve': 'receive', 'definately': 'definitely', 'occured': 'occurred',
    'seperate': 'separate', 'accomodate': 'accommodate', 'neccessary': 'necessary',
    'wierd': 'weird', 'occassion': 'occasion', 'concious': 'conscious',
    'enviroment': 'environment', 'goverment': 'government', 'independant': 'independent',
    'knowlege': 'knowledge', 'langauge': 'language', 'maintainance': 'maintenance',
    'millenium': 'millennium', 'noticable': 'noticeable', 'persistant': 'persistent',
    'publically': 'publicly', 'recomend': 'recommend', 'refering': 'referring',
    'successfull': 'successful', 'suprise': 'surprise', 'tommorow': 'tomorrow',
    'untill': 'until', 'wether': 'whether', 'wich': 'which',
    'thier': 'their', 'teh': 'the', 'adn': 'and', 'hte': 'the',
    'becuase': 'because', 'beleive': 'believe', 'calender': 'calendar',
    'collegue': 'colleague', 'comittee': 'committee', 'dissapoint': 'disappoint',
    'embarass': 'embarrass', 'existance': 'existence', 'foriegn': 'foreign',
    'gurantee': 'guarantee', 'harrass': 'harass', 'imediately': 'immediately',
    'jewlery': 'jewelry', 'judgement': 'judgment', 'liason': 'liaison',
}
|
|
|
|
# Polish misspelling -> correction pairs.
# Entries whose key equals its value (marked "correct") are no-ops:
# gen_spelling_corrections() filters them out (bad != good) before sampling.
TYPOS_PL = {
    'rząd': 'rząd',  # correct, no-op example
    'wziąść': 'wziąć', 'włanczyć': 'włączyć', 'poszłem': 'poszedłem',
    'pokarze': 'pokaże', 'napewno': 'na pewno', 'wogóle': 'w ogóle',
    'niewiem': 'nie wiem', 'przedewszystkim': 'przede wszystkim',
    'conajmniej': 'co najmniej', 'natomist': 'natomiast',
    'ponieważ': 'ponieważ',  # correct
    'żadko': 'rzadko', 'bynajmiej': 'bynajmniej',
    'jakby': 'jakby',  # correct
    'pomóżcie': 'pomóżcie',  # correct
    'włożyłam': 'włożyłam',  # correct
    # NOTE(review): 'wyrzygnąć' looks like a real (vulgar) verb and 'wyrzucić'
    # is a different word — this reads as a substitution, not a spelling fix; confirm.
    'spróbój': 'spróbuj', 'wyrzygnąć': 'wyrzucić',
    'przyjeżdzać': 'przyjeżdżać', 'żółtko': 'żółtko',  # 'żółtko' is correct (no-op)
    'gżegżółka': 'gżegżółka',  # correct — hard word
    'sprawdźić': 'sprawdzić', 'ząmówić': 'zamówić',
    'orginalny': 'oryginalny', 'symultaniczny': 'symultaniczny',  # 'symultaniczny' maps to itself (no-op)
    'odzwyczaić': 'odzwyczaić',  # correct
    'czterysta': 'czterysta',  # correct
    'rzentelny': 'rzetelny', 'wchodzić': 'wchodzić',  # 'wchodzić' is correct (no-op)
    # NOTE(review): key != value, so this pair IS sampled as a correction, yet
    # the target 'porządze' does not look like a valid word — confirm the intent.
    'porządże': 'porządze',  # intentional trap
}
|
|
|
|
# ──────────────────────────────────────────────────────────────
|
|
# SENTENCE TEMPLATES (with {} placeholders)
|
|
# ──────────────────────────────────────────────────────────────
|
|
# English sentences with a single {} slot. gen_numbers() fills the slot with the
# digits for the input text and with the spelled-out number for the target.
NUM_TEMPLATES_EN = [
    "There are {} items in the queue.",
    "The building has {} floors.",
    "She scored {} points in the final round.",
    "Approximately {} people attended the event.",
    "{} units were shipped yesterday.",
    "The file contains {} lines of code.",
    "He waited {} minutes for the train.",
    "The population reached {} last year.",
    "We need {} more signatures.",
    "The distance is {} meters.",
    "Page {} of the document.",
    "Chapter {} covers advanced topics.",
    "Flight {} departs at noon.",
    "Room {} is on the third floor.",
    "The team completed {} sprints this quarter.",
]
|
|
|
|
# Polish counterparts of NUM_TEMPLATES_EN (one {} slot each), used by gen_numbers().
# NOTE(review): the nouns after {} are in the genitive plural (elementów,
# pięter, ...), which fits counts such as 5 or 12 but not 1 or 2-4, while
# gen_numbers() samples values from 0 upward — confirm this is acceptable.
NUM_TEMPLATES_PL = [
    "W kolejce jest {} elementów.",
    "Budynek ma {} pięter.",
    "Zdobyła {} punktów w finale.",
    "Na wydarzeniu pojawiło się około {} osób.",
    "Wysłano {} jednostek wczoraj.",
    "Plik zawiera {} linii kodu.",
    "Czekał {} minut na pociąg.",
    "Populacja osiągnęła {} w zeszłym roku.",
    "Potrzebujemy jeszcze {} podpisów.",
    "Odległość wynosi {} metrów.",
    "Strona {} dokumentu.",
    "Rozdział {} obejmuje zaawansowane tematy.",
    "Lot {} odlatuje w południe.",
    "Pokój {} jest na trzecim piętrze.",
    "Zespół ukończył {} sprintów w tym kwartale.",
]
|
|
|
|
# English sentences with a single {} slot. gen_currency() fills the slot with
# the raw amount (symbol + digits) for the input and with fmt_currency() output
# for the target.
CURRENCY_TEMPLATES_EN = [
    "The total is {}.",
    "She paid {}.",
    "The budget was set at {}.",
    "It costs {} per unit.",
    "They raised {} for charity.",
    "The invoice shows {}.",
    "Repairs will cost approximately {}.",
    "The price dropped to {}.",
]
|
|
|
|
# Polish counterparts of CURRENCY_TEMPLATES_EN (one {} slot each), used by
# gen_currency() for the [pl]-tagged examples.
CURRENCY_TEMPLATES_PL = [
    "Łącznie to {}.",
    "Zapłaciła {}.",
    "Budżet ustalono na {}.",
    "Kosztuje {} za sztukę.",
    "Zebrali {} na cele charytatywne.",
    "Faktura pokazuje {}.",
    "Naprawa będzie kosztować około {}.",
    "Cena spadła do {}.",
]
|
|
|
|
# English sentences with a single {} slot. gen_acronyms() fills the slot with
# the acronym for the input and with its spaced-letter reading for the target.
ACRONYM_TEMPLATES_EN = [
    "The {} system is down.",
    "Check the {} settings.",
    "We're migrating to {}.",
    "The {} update is ready.",
    "{} performance improved significantly.",
    "Connect via {} to the server.",
    "The {} team approved the request.",
]
|
|
|
|
# Polish counterparts of ACRONYM_TEMPLATES_EN (one {} slot each), used by
# gen_acronyms() for the [pl]-tagged examples.
ACRONYM_TEMPLATES_PL = [
    "System {} nie działa.",
    "Sprawdź ustawienia {}.",
    "Migrujemy do {}.",
    "Aktualizacja {} jest gotowa.",
    "Wydajność {} znacząco się poprawiła.",
    "Połącz się przez {} z serwerem.",
    "Zespół {} zatwierdził wniosek.",
]
|
|
|
|
# ──────────────────────────────────────────────────────────────
|
|
# HELPERS
|
|
# ──────────────────────────────────────────────────────────────
|
|
|
|
def n2w(n, lang):
    """Spell out *n* in words for language *lang* via num2words.

    Negative ints/floats are rendered as ``"minus "`` followed by the words for
    the absolute value, instead of handing the sign to num2words.
    """
    is_negative = isinstance(n, (int, float)) and n < 0
    if not is_negative:
        return num2words(n, lang=lang)
    magnitude_words = num2words(abs(n), lang=lang)
    return f"minus {magnitude_words}"
|
|
|
|
def n2w_ordinal(n, lang):
    """Return the ordinal wording of *n* in *lang* (num2words ``to='ordinal'``)."""
    ordinal_words = num2words(n, lang=lang, to='ordinal')
    return ordinal_words
|
|
|
|
def en_currency_name(symbol, n):
    """Return the English currency word for *symbol*, agreeing with amount *n*.

    The singular is used only when ``abs(n) == 1``. A symbol missing from the
    table is returned unchanged.
    """
    singular_plural = {
        '$': ('dollar', 'dollars'),
        '€': ('euro', 'euros'),
        'zł': ('zloty', 'zloty'),
        '£': ('pound', 'pounds'),
        '¥': ('yen', 'yen'),
        'CHF': ('Swiss franc', 'Swiss francs'),
        'kr': ('krona', 'kronor'),
    }
    forms = singular_plural.get(symbol)
    if forms is None:
        return symbol
    singular, plural = forms
    if abs(n) != 1:
        return plural
    return singular
|
|
|
|
def pl_currency_name(symbol, n):
    """Return the Polish currency word for *symbol*, declined for amount *n*.

    Three forms are distinguished: the one used when ``abs(n) == 1``, the one
    used when the last digit is 2-4 (except for 12-14 in the last two digits),
    and the one used for everything else. A symbol missing from the table is
    returned unchanged.
    """
    declensions = {
        '$': ('dolar', 'dolary', 'dolarów'),
        '€': ('euro', 'euro', 'euro'),
        'zł': ('złoty', 'złote', 'złotych'),
        '£': ('funt', 'funty', 'funtów'),
        '¥': ('jen', 'jeny', 'jenów'),
        'CHF': ('frank szwajcarski', 'franki szwajcarskie', 'franków szwajcarskich'),
        'kr': ('korona', 'korony', 'koron'),
    }
    after_one, after_few, after_many = declensions.get(symbol, (symbol,) * 3)

    magnitude = abs(n)
    if magnitude == 1:
        return after_one

    ends_in_2_to_4 = 2 <= magnitude % 10 <= 4
    is_teen = 12 <= magnitude % 100 <= 14
    if ends_in_2_to_4 and not is_teen:
        return after_few
    return after_many
|
|
|
|
def fmt_currency(n, symbol, lang):
    """Return "<amount in words> <currency name>" for *n* in language *lang*.

    ``'en'`` uses the English currency names; any other *lang* value uses the
    Polish declension helper.
    """
    amount_words = n2w(n, lang)
    name_for = en_currency_name if lang == 'en' else pl_currency_name
    return f"{amount_words} {name_for(symbol, n)}"
|
|
|
|
def make(raw, norm):
    """Wrap one raw/normalized pair as a ShareGPT-style conversation dict.

    The conversation has three turns: the shared SYSTEM prompt, the raw
    (tagged) text as the human turn, and the normalized text as the gpt turn.
    """
    turns = (
        ("system", SYSTEM),
        ("human", raw),
        ("gpt", norm),
    )
    return {
        "conversations": [{"from": role, "value": text} for role, text in turns]
    }
|
|
|
|
# ──────────────────────────────────────────────────────────────
|
|
# GENERATORS
|
|
# ──────────────────────────────────────────────────────────────
|
|
|
|
def gen_numbers(count=200):
    """Build cardinal-number examples embedded in template sentences.

    Each iteration picks a magnitude band, draws one integer from it and emits
    an [en] and a [pl] example for that integer, so the result holds
    ``2 * count`` examples.
    """
    magnitude_bands = [
        (0, 20), (21, 99), (100, 999), (1000, 9999),
        (10000, 99999), (100000, 999999),
    ]
    localized_templates = (
        ('en', NUM_TEMPLATES_EN),
        ('pl', NUM_TEMPLATES_PL),
    )

    out = []
    for _ in range(count):
        low, high = random.choice(magnitude_bands)
        value = random.randint(low, high)
        # English first, then Polish: keeps the order of RNG draws stable.
        for lang, templates in localized_templates:
            sentence = random.choice(templates)
            tagged_input = f"[{lang}] {sentence.format(value)}"
            target = sentence.format(n2w(value, lang))
            out.append(make(tagged_input, target))
    return out
|
|
|
|
|
|
def gen_negatives_decimals(count=80):
    """Build examples for negative integers and decimal numbers.

    Each iteration emits four examples: an [en]/[pl] pair containing a negative
    integer in -500..-1, and an [en]/[pl] pair containing a decimal whose
    fractional digits are read one at a time after "point" / "przecinek".

    Args:
        count: Number of iterations; the result holds ``4 * count`` examples.

    Returns:
        List of conversation dicts produced by ``make``.
    """
    def digit_words(digits, lang):
        """Spell every character of a digit string as a separate word."""
        return ' '.join(n2w(int(d), lang) for d in digits)

    examples = []
    for _ in range(count):
        # Negative integers
        n = -random.randint(1, 500)
        examples.append(make(
            f"[en] The temperature is {n} degrees.",
            f"The temperature is {n2w(n, 'en')} degrees."
        ))
        examples.append(make(
            f"[pl] Temperatura wynosi {n} stopni.",
            f"Temperatura wynosi {n2w(n, 'pl')} stopni."
        ))

        # Decimals. The fraction is written without zero padding, so the digits
        # spoken below are exactly the ones that appear after the dot.
        whole = random.randint(0, 999)
        frac = random.randint(1, 99)
        frac_digits = str(frac)
        dec_str = f"{whole}.{frac_digits}"

        en_spoken = f"{n2w(whole, 'en')} point {digit_words(frac_digits, 'en')}"
        pl_spoken = f"{n2w(whole, 'pl')} przecinek {digit_words(frac_digits, 'pl')}"

        examples.append(make(
            f"[en] The measurement reads {dec_str}.",
            f"The measurement reads {en_spoken}."
        ))
        examples.append(make(
            f"[pl] Pomiar wskazuje {dec_str}.",
            f"Pomiar wskazuje {pl_spoken}."
        ))

    return examples
|
|
|
|
|
|
def gen_ordinals(count=80):
    """Build ordinal-number examples for 1..100.

    English input uses the numeric form with its st/nd/rd/th suffix; Polish
    input uses the "<n>." notation. Two examples are emitted per iteration.

    NOTE(review): the Polish target pairs the num2words ordinal with the
    feminine noun "próba"; if num2words returns masculine forms the phrase will
    not agree in gender — confirm against the generated data.
    """
    special_suffixes = {1: 'st', 2: 'nd', 3: 'rd'}

    out = []
    for _ in range(count):
        rank = random.randint(1, 100)
        if 11 <= rank % 100 <= 13:
            # 11th, 12th, 13th do not follow the last-digit rule.
            suffix = 'th'
        else:
            suffix = special_suffixes.get(rank % 10, 'th')

        en_words = n2w_ordinal(rank, 'en')
        pl_words = n2w_ordinal(rank, 'pl')
        out.append(make(
            f"[en] This is the {rank}{suffix} attempt.",
            f"This is the {en_words} attempt."
        ))
        out.append(make(
            f"[pl] To jest {rank}. próba.",
            f"To jest {pl_words} próba."
        ))
    return out
|
|
|
|
|
|
def gen_percentages(count=60):
    """Build percentage examples with integer or one-decimal values.

    Every iteration draws both an integer and a one-decimal candidate and then
    picks one of them; the chosen value yields an [en] and a [pl] example.
    """
    def spoken(value, lang, separator):
        """Words for *value*; floats read their fraction digit by digit."""
        if not isinstance(value, float):
            return n2w(value, lang)
        fraction_digits = str(value).split('.')[1]
        digit_words = ' '.join(n2w(int(ch), lang) for ch in fraction_digits)
        return f"{n2w(int(value), lang)} {separator} {digit_words}"

    out = []
    for _ in range(count):
        # Both candidates are always drawn (integer first) before choosing.
        integer_pct = random.randint(0, 100)
        decimal_pct = round(random.uniform(0, 100), 1)
        pct = random.choice([integer_pct, decimal_pct])

        en_words = spoken(pct, 'en', 'point')
        pl_words = spoken(pct, 'pl', 'przecinek')

        out.append(make(
            f"[en] The success rate is {pct}%.",
            f"The success rate is {en_words} percent."
        ))
        out.append(make(
            f"[pl] Wskaźnik sukcesu wynosi {pct}%.",
            f"Wskaźnik sukcesu wynosi {pl_words} procent."
        ))
    return out
|
|
|
|
|
|
def gen_dates(count=100):
    """Build date examples in several numeric formats.

    Each iteration draws a day (1-28), month and year (1990-2030) and emits:

    * an [en] example whose input is written in US (MM/DD/YYYY), European
      (DD.MM.YYYY) or ISO (YYYY-MM-DD) form and whose target reads
      "<Month> <ordinal day>, <year>";
    * a [pl] example whose input is DD.MM.YYYY and whose target reads
      "<ordinal day> <month in genitive> <year>".

    The English year is rendered with num2words' ``to='year'`` mode so that it
    is read the way years are spoken (e.g. "nineteen ninety-five") instead of
    as a plain cardinal ("one thousand, nine hundred and ninety-five"). The
    Polish year is still a cardinal.

    Args:
        count: Number of iterations; the result holds ``2 * count`` examples.

    Returns:
        List of conversation dicts produced by ``make``.
    """
    examples = []

    months_en = ['January', 'February', 'March', 'April', 'May', 'June',
                 'July', 'August', 'September', 'October', 'November', 'December']
    months_pl_gen = ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
                     'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia']

    for _ in range(count):
        day = random.randint(1, 28)
        month = random.randint(1, 12)
        year = random.randint(1990, 2030)

        # EN: "05/25/2026" or "25.05.2026" or "2026-05-25"
        en_day_ord = n2w_ordinal(day, 'en')
        en_year = num2words(year, to='year', lang='en')
        en_spoken = f"{months_en[month-1]} {en_day_ord}, {en_year}"

        fmt = random.choice(['us', 'eu', 'iso'])
        if fmt == 'us':
            raw_date = f"{month:02d}/{day:02d}/{year}"
        elif fmt == 'eu':
            raw_date = f"{day:02d}.{month:02d}.{year}"
        else:
            raw_date = f"{year}-{month:02d}-{day:02d}"

        examples.append(make(
            f"[en] The deadline is {raw_date}.",
            f"The deadline is {en_spoken}."
        ))

        # PL: always "25.05.2026"
        pl_day_ord = n2w_ordinal(day, 'pl')
        pl_year = n2w(year, 'pl')
        pl_spoken = f"{pl_day_ord} {months_pl_gen[month-1]} {pl_year}"

        raw_date_pl = f"{day:02d}.{month:02d}.{year}"
        examples.append(make(
            f"[pl] Termin to {raw_date_pl}.",
            f"Termin to {pl_spoken}."
        ))

    return examples
|
|
|
|
|
|
def gen_times(count=80):
    """Build clock-time examples in 24-hour and 12-hour notation.

    Each iteration draws one hour/minute pair and emits three examples: the
    24-hour form tagged [en], the same form tagged [pl], and the 12-hour
    AM/PM form tagged [en].

    Two defects of the earlier version are fixed here:

    * minutes 01-09 keep their leading zero when spoken ("fourteen oh five",
      "czternaście zero pięć") instead of collapsing to "fourteen five";
    * the 12-hour target no longer contains a double space when the minutes
      are zero ("three PM", not "three  PM").

    Args:
        count: Number of iterations; the result holds ``3 * count`` examples.

    Returns:
        List of conversation dicts produced by ``make``.
    """
    def spoken_minutes(minute, lang):
        """Words for the minutes, or '' on the full hour."""
        if minute == 0:
            return ""
        words = n2w(minute, lang)
        if minute < 10:
            leading_zero = "oh" if lang == 'en' else "zero"
            return f"{leading_zero} {words}"
        return words

    def join_words(*parts):
        """Join the non-empty parts with single spaces."""
        return ' '.join(part for part in parts if part)

    examples = []
    for _ in range(count):
        h24 = random.randint(0, 23)
        m = random.randint(0, 59)

        # 24h format
        time_str = f"{h24:02d}:{m:02d}"
        en_h = n2w(h24, 'en')
        if m != 0:
            en_spoken = join_words(en_h, spoken_minutes(m, 'en'))
        elif h24 > 0:
            en_spoken = f"{en_h} hundred"
        else:
            en_spoken = "midnight"

        pl_spoken = join_words(n2w(h24, 'pl'), spoken_minutes(m, 'pl'))

        examples.append(make(
            f"[en] The meeting is at {time_str}.",
            f"The meeting is at {en_spoken}."
        ))
        examples.append(make(
            f"[pl] Spotkanie jest o {time_str}.",
            f"Spotkanie jest o {pl_spoken}."
        ))

        # 12h format (EN only common); hour 0 and 12 both display as 12.
        h12 = h24 % 12 or 12
        ampm = "AM" if h24 < 12 else "PM"
        time_12 = f"{h12}:{m:02d} {ampm}"
        en_12_spoken = join_words(n2w(h12, 'en'), spoken_minutes(m, 'en'), ampm)
        examples.append(make(
            f"[en] Alarm set for {time_12}.",
            f"Alarm set for {en_12_spoken}."
        ))

    return examples
|
|
|
|
|
|
def gen_currency(count=120):
    """Build currency examples; any symbol may appear under either language tag.

    The first loop emits ``2 * count`` whole-amount examples ([en] and [pl])
    using the currency sentence templates. The second loop emits 20 [en]/[pl]
    pairs with a fractional part ("Total: $12.05").

    The fractional part is now named after the currency's own subunit and
    agrees with the count: previously every English target said "cents" and
    every Polish target said "groszy", whatever the symbol or amount (e.g.
    "one cents", or "groszy" after a dollar amount).

    Args:
        count: Number of whole-amount iterations.

    Returns:
        List of conversation dicts produced by ``make``.
    """
    symbols = ['$', '€', 'zł', '£']
    prefixed = {'$', '€', '£'}  # symbol written before the number; others follow it

    # symbol -> (singular, plural) of the subunit in English
    en_subunits = {
        '$': ('cent', 'cents'),
        '€': ('cent', 'cents'),
        'zł': ('grosz', 'groszy'),
        '£': ('penny', 'pence'),
    }
    # symbol -> (after 1, after 2-4, after 5+) of the subunit in Polish
    pl_subunits = {
        '$': ('cent', 'centy', 'centów'),
        '€': ('cent', 'centy', 'centów'),
        'zł': ('grosz', 'grosze', 'groszy'),
        '£': ('pens', 'pensy', 'pensów'),
    }

    def pl_plural(n, one, few, many):
        """Pick the Polish noun form that agrees with the integer *n*."""
        if n == 1:
            return one
        if 2 <= n % 10 <= 4 and not 12 <= n % 100 <= 14:
            return few
        return many

    def raw_amount_for(sym, digits):
        """Place the symbol before or after the digits, as written in text."""
        return f"{sym}{digits}" if sym in prefixed else f"{digits} {sym}"

    examples = []
    for _ in range(count):
        sym = random.choice(symbols)
        n = random.choice([
            random.randint(1, 99),
            random.randint(100, 9999),
            random.randint(10000, 999999),
        ])
        raw_amount = raw_amount_for(sym, f"{n:,}")

        # English
        tpl = random.choice(CURRENCY_TEMPLATES_EN)
        examples.append(make(
            f"[en] {tpl.format(raw_amount)}",
            tpl.format(fmt_currency(n, sym, 'en'))
        ))

        # Polish
        tpl = random.choice(CURRENCY_TEMPLATES_PL)
        examples.append(make(
            f"[pl] {tpl.format(raw_amount)}",
            tpl.format(fmt_currency(n, sym, 'pl'))
        ))

    # Amounts with a fractional part (cents, pence, grosze)
    for _ in range(20):
        whole = random.randint(1, 999)
        cents = random.randint(1, 99)
        sym = random.choice(symbols)
        raw = raw_amount_for(sym, f"{whole}.{cents:02d}")

        en_one, en_many = en_subunits[sym]
        en_sub = en_one if cents == 1 else en_many
        pl_sub = pl_plural(cents, *pl_subunits[sym])

        en_spoken = f"{n2w(whole, 'en')} {en_currency_name(sym, whole)} and {n2w(cents, 'en')} {en_sub}"
        pl_spoken = f"{n2w(whole, 'pl')} {pl_currency_name(sym, whole)} i {n2w(cents, 'pl')} {pl_sub}"

        examples.append(make(f"[en] Total: {raw}", f"Total: {en_spoken}"))
        examples.append(make(f"[pl] Łącznie: {raw}", f"Łącznie: {pl_spoken}"))

    return examples
|
|
|
|
|
|
def gen_units(count=80):
    """Build unit-of-measure examples ("12 km" -> "twelve kilometers").

    Each iteration picks a unit and a value (small integer, larger integer or
    one-decimal float) and emits an [en] and a [pl] example.

    The unit name now agrees with the value. Previously the English name was
    always plural ("one kilometers") and the Polish name always genitive
    plural ("dwa kilometrów"). Rules applied:

    * English: singular for the integer 1, plural otherwise (decimals included).
    * Polish integers: one form after 1, another after numbers ending in 2-4
      (except 12-14), genitive plural otherwise.
    * Polish decimals: genitive singular ("dwa przecinek pięć kilometra").
    * Polish feminine units ("mila"): integer numerals ending in "jeden"/"dwa"
      are switched to "jedna"/"dwie" where grammar requires it.

    Args:
        count: Number of iterations; the result holds ``2 * count`` examples.

    Returns:
        List of conversation dicts produced by ``make``.
    """
    # symbol -> (EN singular, EN plural,
    #            PL after 1, PL after 2-4, PL after 5+, PL after a decimal)
    units = {
        'km': ('kilometer', 'kilometers',
               'kilometr', 'kilometry', 'kilometrów', 'kilometra'),
        'm': ('meter', 'meters',
              'metr', 'metry', 'metrów', 'metra'),
        'cm': ('centimeter', 'centimeters',
               'centymetr', 'centymetry', 'centymetrów', 'centymetra'),
        'mm': ('millimeter', 'millimeters',
               'milimetr', 'milimetry', 'milimetrów', 'milimetra'),
        'kg': ('kilogram', 'kilograms',
               'kilogram', 'kilogramy', 'kilogramów', 'kilograma'),
        'g': ('gram', 'grams',
              'gram', 'gramy', 'gramów', 'grama'),
        'mg': ('milligram', 'milligrams',
               'miligram', 'miligramy', 'miligramów', 'miligrama'),
        'l': ('liter', 'liters',
              'litr', 'litry', 'litrów', 'litra'),
        'ml': ('milliliter', 'milliliters',
               'mililitr', 'mililitry', 'mililitrów', 'mililitra'),
        'km/h': ('kilometer per hour', 'kilometers per hour',
                 'kilometr na godzinę', 'kilometry na godzinę',
                 'kilometrów na godzinę', 'kilometra na godzinę'),
        'mph': ('mile per hour', 'miles per hour',
                'mila na godzinę', 'mile na godzinę',
                'mil na godzinę', 'mili na godzinę'),
        'GB': ('gigabyte', 'gigabytes',
               'gigabajt', 'gigabajty', 'gigabajtów', 'gigabajta'),
        'MB': ('megabyte', 'megabytes',
               'megabajt', 'megabajty', 'megabajtów', 'megabajta'),
        'TB': ('terabyte', 'terabytes',
               'terabajt', 'terabajty', 'terabajtów', 'terabajta'),
        'GHz': ('gigahertz', 'gigahertz',
                'gigaherc', 'gigaherce', 'gigaherców', 'gigaherca'),
        'MHz': ('megahertz', 'megahertz',
                'megaherc', 'megaherce', 'megaherców', 'megaherca'),
        'kW': ('kilowatt', 'kilowatts',
               'kilowat', 'kilowaty', 'kilowatów', 'kilowata'),
        'W': ('watt', 'watts',
              'wat', 'waty', 'watów', 'wata'),
        'V': ('volt', 'volts',
              'wolt', 'wolty', 'woltów', 'wolta'),
        'A': ('ampere', 'amperes',
              'amper', 'ampery', 'amperów', 'ampera'),
    }
    feminine_pl = {'mph'}  # units whose Polish noun ("mila") is feminine

    templates_en = [
        "The speed is {} {}.",
        "It weighs {} {}.",
        "The capacity is {} {}.",
        "Measured {} {}.",
        "Maximum: {} {}.",
    ]
    templates_pl = [
        "Prędkość wynosi {} {}.",
        "Waży {} {}.",
        "Pojemność to {} {}.",
        "Zmierzono {} {}.",
        "Maksimum: {} {}.",
    ]

    def digit_words(digits, lang):
        """Spell every character of a digit string as a separate word."""
        return ' '.join(n2w(int(d), lang) for d in digits)

    def pl_plural(n, one, few, many):
        """Pick the Polish noun form that agrees with the integer *n*."""
        if n == 1:
            return one
        if 2 <= n % 10 <= 4 and not 12 <= n % 100 <= 14:
            return few
        return many

    def feminine_numeral(words):
        """Switch a trailing "jeden"/"dwa" to the feminine "jedna"/"dwie"."""
        head, _, last = words.rpartition(' ')
        if not head and last == 'jeden':
            return 'jedna'
        if last == 'dwa':
            return f"{head} dwie".strip()
        return words

    unit_items = list(units.items())
    examples = []
    for _ in range(count):
        unit, forms = random.choice(unit_items)
        en_one, en_many, pl_one, pl_few, pl_many, pl_fraction = forms
        n = random.choice([
            random.randint(1, 99),
            random.randint(100, 9999),
            round(random.uniform(0.1, 99.9), 1),
        ])

        raw_n = str(n)
        if isinstance(n, float):
            whole = int(n)
            frac = raw_n.split('.')[1]
            en_w = f"{n2w(whole, 'en')} point {digit_words(frac, 'en')}"
            pl_w = f"{n2w(whole, 'pl')} przecinek {digit_words(frac, 'pl')}"
            en_name, pl_name = en_many, pl_fraction
        else:
            en_w = n2w(n, 'en')
            pl_w = n2w(n, 'pl')
            en_name = en_one if n == 1 else en_many
            pl_name = pl_plural(n, pl_one, pl_few, pl_many)
            if unit in feminine_pl:
                pl_w = feminine_numeral(pl_w)

        tpl = random.choice(templates_en)
        examples.append(make(
            f"[en] {tpl.format(raw_n, unit)}",
            tpl.format(en_w, en_name)
        ))

        tpl = random.choice(templates_pl)
        examples.append(make(
            f"[pl] {tpl.format(raw_n, unit)}",
            tpl.format(pl_w, pl_name)
        ))

    return examples
|
|
|
|
|
|
def gen_temperatures(count=40):
    """Build temperature examples for °C (both languages) and °F (English).

    Each iteration emits an [en]/[pl] pair for a Celsius value in -30..45 and
    one [en] example for a Fahrenheit value in 0..110.

    The unit word now agrees with the value: "one degree" / "minus one degree"
    in English, and "stopień" / "stopnie" / "stopni" in Polish. Previously the
    targets always used "degrees" and "stopni".

    Args:
        count: Number of iterations; the result holds ``3 * count`` examples.

    Returns:
        List of conversation dicts produced by ``make``.
    """
    def en_degrees(n):
        """English unit word agreeing with *n* (sign ignored)."""
        return 'degree' if abs(n) == 1 else 'degrees'

    def pl_degrees(n):
        """Polish unit word agreeing with *n* (sign ignored)."""
        magnitude = abs(n)
        if magnitude == 1:
            return 'stopień'
        if 2 <= magnitude % 10 <= 4 and not 12 <= magnitude % 100 <= 14:
            return 'stopnie'
        return 'stopni'

    examples = []
    for _ in range(count):
        t = random.randint(-30, 45)
        examples.append(make(
            f"[en] Current temperature: {t}°C.",
            f"Current temperature: {n2w(t, 'en')} {en_degrees(t)} Celsius."
        ))
        examples.append(make(
            f"[pl] Aktualna temperatura: {t}°C.",
            f"Aktualna temperatura: {n2w(t, 'pl')} {pl_degrees(t)} Celsjusza."
        ))

        f_temp = random.randint(0, 110)
        examples.append(make(
            f"[en] It's {f_temp}°F outside.",
            f"It's {n2w(f_temp, 'en')} {en_degrees(f_temp)} Fahrenheit outside."
        ))

    return examples
|
|
|
|
|
|
def gen_acronyms(count=150):
    """Build acronym examples: spelled out, pronounced as words, or plain caps.

    Per iteration, five examples are emitted:

    * a spelled acronym inside an English and a Polish template (target uses
      the spaced-letter reading);
    * a word-like acronym in an English and a Polish sentence (target equals
      the untagged input);
    * an all-caps ordinary word in an English sentence (target equals the
      untagged input).
    """
    spelled_pairs = list(SPELLED_ACRONYMS.items())
    localized_templates = (
        ('en', ACRONYM_TEMPLATES_EN),
        ('pl', ACRONYM_TEMPLATES_PL),
    )

    out = []
    for _ in range(count):
        # Letter-by-letter acronym
        acronym, letters = random.choice(spelled_pairs)
        for lang, templates in localized_templates:
            sentence = random.choice(templates)
            out.append(make(
                f"[{lang}] {sentence.format(acronym)}",
                sentence.format(letters),
            ))

        # Acronym read as a word: the text passes through untouched
        pronounced = random.choice(WORD_ACRONYMS)
        en_sentence = f"The {pronounced} project launched."
        pl_sentence = f"Projekt {pronounced} wystartował."
        out.append(make(f"[en] {en_sentence}", en_sentence))
        out.append(make(f"[pl] {pl_sentence}", pl_sentence))

        # Ordinary word written in capitals: also preserved as-is
        shouted = random.choice(NOT_ACRONYMS)
        notice = f"{shouted}: read the instructions."
        out.append(make(f"[en] {notice}", notice))

    return out
|
|
|
|
|
|
def gen_nato(count=100):
    """Build NATO phonetic alphabet examples in both directions.

    Per iteration, five examples are emitted from two independent letter draws:

    * decode: code words -> spaced letters, tagged [en] and [pl];
    * encode: a letter string -> code words, tagged [en] and [pl];
    * decode inside a running English sentence (second draw of 3-5 letters).

    Letters within one draw are distinct because they come from random.sample.
    """
    alphabet = list(NATO)

    out = []
    for _ in range(count):
        how_many = random.randint(2, 6)
        picked = random.sample(alphabet, how_many)
        code_words = ' '.join(NATO[letter] for letter in picked)
        compact = ''.join(picked)
        spaced = ' '.join(picked)

        out.extend([
            # Code words -> individual letters
            make(f"[en] Callsign: {code_words}.", f"Callsign: {spaced}."),
            make(f"[pl] Znak wywoławczy: {code_words}.", f"Znak wywoławczy: {spaced}."),
            # Letters -> code words
            make(f"[en] Spell {compact} using NATO phonetic.", f"{code_words}."),
            make(f"[pl] Przeliteruj {compact} alfabetem NATO.", f"{code_words}."),
        ])

        # Second, independent draw used inside a sentence
        extra_len = random.randint(3, 5)
        extra_letters = random.sample(alphabet, extra_len)
        extra_code = ' '.join(NATO[letter] for letter in extra_letters)
        extra_spaced = ' '.join(extra_letters)
        out.append(make(
            f"[en] The code is {extra_code}.",
            f"The code is {extra_spaced}."
        ))

    return out
|
|
|
|
|
|
def gen_spelling_corrections(count=80):
    """Build examples where a misspelled word is replaced by its correction.

    Only TYPOS_EN / TYPOS_PL pairs whose key differs from its value are used.
    Each iteration emits one [en] and one [pl] example (a language is skipped
    if it has no usable pairs).
    """
    frames_en = [
        "The {} was unexpected.",
        "We need to {} the system.",
        "It was {} to everyone.",
        "The {} process failed.",
        "Please {} the document.",
    ]
    frames_pl = [
        "To było {}.",
        "Trzeba {} system.",
        "Było to {} dla wszystkich.",
        "Proces {} się nie powiódł.",
        "Proszę {} dokument.",
    ]

    def real_typos(table):
        """Drop the no-op entries that map a word to itself."""
        return [(wrong, right) for wrong, right in table.items() if wrong != right]

    per_language = (
        ('en', real_typos(TYPOS_EN), frames_en),
        ('pl', real_typos(TYPOS_PL), frames_pl),
    )

    out = []
    for _ in range(count):
        for lang, pairs, frames in per_language:
            if not pairs:
                continue
            wrong, right = random.choice(pairs)
            frame = random.choice(frames)
            out.append(make(
                f"[{lang}] {frame.format(wrong)}",
                frame.format(right),
            ))
    return out
|
|
|
|
|
|
def gen_urls_emails(count=80):
    """Build examples that turn URLs and e-mail addresses into spoken form.

    Each iteration emits an [en]/[pl] pair for an ``https://`` URL (the scheme
    is dropped from the target) and an [en]/[pl] pair for an e-mail address.
    Dots become "dot", known TLDs use the readings in ``tlds``, and path
    punctuation is replaced by its name.

    Compared with the earlier version, path segments no longer leak unspeakable
    tokens into the target: digit runs ("/issues/42", "/api/v2") are expanded
    to words in the tagged language and "_" is read as "underscore".

    Args:
        count: Number of iterations; the result holds ``4 * count`` examples.

    Returns:
        List of conversation dicts produced by ``make``.
    """
    examples = []

    domains = ['github.com', 'google.com', 'example.org', 'openai.com',
               'huggingface.co', 'reddit.com', 'stackoverflow.com',
               'wikipedia.org', 'youtube.com', 'docs.python.org']
    paths = ['/docs', '/api/v2', '/user/settings', '/search?q=test',
             '/releases/latest', '/issues/42', '/wiki/Main_Page']
    tlds = {'com': 'dot com', 'org': 'dot org', 'co': 'dot co',
            'net': 'dot net', 'pl': 'dot P L', 'dev': 'dot dev'}
    users = ['john', 'anna', 'admin', 'support', 'info', 'kontakt']
    email_domains = ['gmail.com', 'outlook.com', 'firma.pl', 'example.org']

    path_symbols = (
        ('/', 'slash'),
        ('?', 'question mark'),
        ('=', 'equals'),
        ('#', 'hash'),
        ('_', 'underscore'),
    )

    def speak_host(host):
        """Read a host name: labels joined by "dot", TLD from the table."""
        *labels, tld = host.split('.')
        return f"{' dot '.join(labels)} {tlds.get(tld, f'dot {tld}')}"

    def speak_path(path, lang):
        """Read a URL path: name the punctuation and expand digit runs."""
        spoken = path
        for symbol, word in path_symbols:
            spoken = spoken.replace(symbol, f" {word} ")
        spoken = re.sub(
            r'\d+',
            lambda match: f" {n2w(int(match.group()), lang)} ",
            spoken,
        )
        # Collapse the padding added above into single spaces.
        return ' '.join(spoken.split())

    for _ in range(count):
        # URL
        domain = random.choice(domains)
        path = random.choice(paths + [''])
        url = f"https://{domain}{path}"

        spoken_domain = speak_host(domain)
        spoken_url_en = f"{spoken_domain} {speak_path(path, 'en')}".strip()
        spoken_url_pl = f"{spoken_domain} {speak_path(path, 'pl')}".strip()

        examples.append(make(
            f"[en] Visit {url} for details.",
            f"Visit {spoken_url_en} for details."
        ))
        examples.append(make(
            f"[pl] Odwiedź {url} po szczegóły.",
            f"Odwiedź {spoken_url_pl} po szczegóły."
        ))

        # Email
        user = random.choice(users)
        edom = random.choice(email_domains)
        email = f"{user}@{edom}"
        spoken_email = f"{user} at {speak_host(edom)}"

        examples.append(make(
            f"[en] Contact us at {email}.",
            f"Contact us at {spoken_email}."
        ))
        examples.append(make(
            f"[pl] Napisz do nas na {email}.",
            f"Napisz do nas na {spoken_email}."
        ))

    return examples
|
|
|
|
|
|
def gen_symbols(count=60):
    """Build examples that read symbols and simple arithmetic aloud.

    The first loop emits ``2 * count`` examples ([en] and [pl]) for a single
    symbol inside a fixed sentence. The second loop emits 20 [en]/[pl] pairs
    for "a <op> b" expressions with both operands in 1..100.
    """
    spoken_en = {
        '@': 'at', '&': 'and', '#': 'hash', '%': 'percent',
        '+': 'plus', '=': 'equals', '/': 'slash', '\\': 'backslash',
        '*': 'asterisk', '^': 'caret', '~': 'tilde', '|': 'pipe',
        '<': 'less than', '>': 'greater than',
    }
    spoken_pl = {
        '@': 'małpa', '&': 'i', '#': 'hash', '%': 'procent',
        '+': 'plus', '=': 'równa się', '/': 'slash', '\\': 'backslash',
        '*': 'gwiazdka', '^': 'daszek', '~': 'tylda', '|': 'kreska pionowa',
        '<': 'mniejsze niż', '>': 'większe niż',
    }
    # Operator readings differ from the symbol readings above ('*' -> "times").
    operator_en = {'+': 'plus', '-': 'minus', '*': 'times', '/': 'divided by'}
    operator_pl = {'+': 'plus', '-': 'minus', '*': 'razy', '/': 'podzielone przez'}
    operators = ['+', '-', '*', '/']
    known_symbols = list(spoken_en)

    out = []
    for _ in range(count):
        symbol = random.choice(known_symbols)
        out.append(make(
            f"[en] Press {symbol} to continue.",
            f"Press {spoken_en[symbol]} to continue."
        ))
        out.append(make(
            f"[pl] Naciśnij {symbol} aby kontynuować.",
            f"Naciśnij {spoken_pl[symbol]} aby kontynuować."
        ))

    # Math expressions
    for _ in range(20):
        left = random.randint(1, 100)
        right = random.randint(1, 100)
        op = random.choice(operators)

        out.append(make(
            f"[en] Calculate {left} {op} {right}.",
            f"Calculate {n2w(left, 'en')} {operator_en[op]} {n2w(right, 'en')}."
        ))
        out.append(make(
            f"[pl] Oblicz {left} {op} {right}.",
            f"Oblicz {n2w(left, 'pl')} {operator_pl[op]} {n2w(right, 'pl')}."
        ))

    return out
|
|
|
|
|
|
def gen_markdown(count=60):
    """Build examples whose target is the input with markdown removed.

    Inputs and targets come from fixed (markdown, plain) pairs; each iteration
    samples one English and one Polish pair, giving ``2 * count`` examples.
    """
    md_pairs_en = [
        ("**important** update", "important update"),
        ("the *quick* brown fox", "the quick brown fox"),
        ("# Main Heading", "Main Heading"),
        ("## Section Two", "Section Two"),
        ("### Subsection", "Subsection"),
        ("Check `config.json` file", "Check config dot json file"),
        ("Use ```python\nprint()``` here", "Use print here"),
        ("Visit [our site](https://example.com)", "Visit our site"),
        ("- item one\n- item two", "item one, item two"),
        ("1. first step\n2. second step", "first step, second step"),
        ("> This is a quote", "This is a quote"),
        ("~~deleted~~ text", "deleted text"),
        ("__underline__ this", "underline this"),
        ("***bold italic*** text", "bold italic text"),
        ("some `inline code` here", "some inline code here"),
    ]
    md_pairs_pl = [
        ("**ważna** aktualizacja", "ważna aktualizacja"),
        ("*szybki* brązowy lis", "szybki brązowy lis"),
        ("# Główny Nagłówek", "Główny Nagłówek"),
        ("## Sekcja Druga", "Sekcja Druga"),
        ("Sprawdź plik `config.json`", "Sprawdź plik config dot json"),
        ("Odwiedź [naszą stronę](https://example.com)", "Odwiedź naszą stronę"),
        ("- element pierwszy\n- element drugi", "element pierwszy, element drugi"),
        ("1. pierwszy krok\n2. drugi krok", "pierwszy krok, drugi krok"),
        ("> To jest cytat", "To jest cytat"),
        ("~~usunięty~~ tekst", "usunięty tekst"),
        ("***pogrubiona kursywa*** tekst", "pogrubiona kursywa tekst"),
    ]
    tagged_pairs = (('en', md_pairs_en), ('pl', md_pairs_pl))

    out = []
    for _ in range(count):
        for lang, pairs in tagged_pairs:
            marked_up, plain = random.choice(pairs)
            out.append(make(f"[{lang}] {marked_up}", plain))
    return out
|
|
|
|
|
|
def gen_phone_numbers(count=40):
    """Build phone-number examples read digit by digit, group by group.

    Each iteration emits three examples: a "+48 XXX XXX XXX" number tagged
    [pl], the same number tagged [en], and a "(XXX) XXX-XXXX" number tagged
    [en]. Groups are separated by commas in the target.
    """
    def read_digits(number, lang):
        """Spell each digit of *number* as its own word."""
        return ' '.join(n2w(int(ch), lang) for ch in str(number))

    def read_groups(groups, lang):
        """Read several digit groups, comma-separated."""
        return ', '.join(read_digits(group, lang) for group in groups)

    out = []
    for _ in range(count):
        # Polish layout: +48 followed by three 3-digit groups
        pl_groups = [random.randint(100, 999) for _ in range(3)]
        phone_pl = "+48 " + ' '.join(str(group) for group in pl_groups)
        pl_spoken = f"plus czterdzieści osiem, {read_groups(pl_groups, 'pl')}"
        en_spoken = f"plus forty-eight, {read_groups(pl_groups, 'en')}"

        out.append(make(
            f"[pl] Zadzwoń pod {phone_pl}.",
            f"Zadzwoń pod {pl_spoken}."
        ))
        out.append(make(
            f"[en] Call {phone_pl}.",
            f"Call {en_spoken}."
        ))

        # US layout: (area) exchange-line
        area = random.randint(200, 999)
        exchange = random.randint(100, 999)
        line = random.randint(1000, 9999)
        phone_us = f"({area}) {exchange}-{line}"
        us_spoken = read_groups((area, exchange, line), 'en')

        out.append(make(
            f"[en] Reach us at {phone_us}.",
            f"Reach us at {us_spoken}."
        ))

    return out
|
|
|
|
|
|
def gen_versions_ips(count=40):
    """Version numbers and IP addresses.

    Each of the ``count`` iterations appends four examples: a random
    ``vMAJOR.MINOR.PATCH`` string under ``[en]`` and ``[pl]``, and a random
    dotted-quad IPv4 address under ``[en]`` and ``[pl]``.  Components are
    read as whole numbers joined by "point"/"kropka" (versions) or
    "dot"/"kropka" (addresses).

    Args:
        count: Number of iterations (four examples are appended per
            iteration).

    Returns:
        List of examples built by ``make(tagged_input, normalized_output)``.
    """
    examples = []
    for _ in range(count):
        # Version: v3.2.1
        major = random.randint(0, 20)
        minor = random.randint(0, 30)
        patch = random.randint(0, 99)
        parts = (major, minor, patch)
        ver = f"v{major}.{minor}.{patch}"
        en_ver = "version " + " point ".join(n2w(p, 'en') for p in parts)
        # The Polish carrier sentence is "Zaktualizuj do ...": the preposition
        # "do" governs the genitive, so the spoken noun must be "wersji",
        # not the nominative "wersja".
        pl_ver = "wersji " + " kropka ".join(n2w(p, 'pl') for p in parts)

        examples.append(make(
            f"[en] Upgrade to {ver}.",
            f"Upgrade to {en_ver}."
        ))
        examples.append(make(
            f"[pl] Zaktualizuj do {ver}.",
            f"Zaktualizuj do {pl_ver}."
        ))

        # IP: 192.168.1.100
        octets = [random.randint(0, 255) for _ in range(4)]
        ip = '.'.join(str(o) for o in octets)
        en_ip = ' dot '.join(n2w(o, 'en') for o in octets)
        pl_ip = ' kropka '.join(n2w(o, 'pl') for o in octets)

        examples.append(make(
            f"[en] Connect to {ip}.",
            f"Connect to {en_ip}."
        ))
        examples.append(make(
            f"[pl] Połącz się z {ip}.",
            f"Połącz się z {pl_ip}."
        ))

    return examples
|
|
|
|
|
|
def gen_passthrough(count=80):
    """Negative examples: already-clean sentences whose target equals the input.

    Every iteration picks one English and one Polish sentence at random and
    emits each with its language tag as input and the untouched sentence as
    the expected output, so ``2 * count`` examples are returned.
    """
    english = (
        "Hello, how are you doing today?",
        "The quick brown fox jumps over the lazy dog.",
        "Please close the door when you leave.",
        "She walked slowly through the garden.",
        "The meeting went well yesterday.",
        "I think we should reconsider the approach.",
        "Thank you for your help with this project.",
        "The weather is beautiful this morning.",
        "Let me know if you need anything else.",
        "We appreciate your continued support.",
        "The results exceeded our expectations.",
        "Can you send me the updated report?",
        "The new design looks great.",
        "I'll follow up with you next week.",
        "The system is running smoothly now.",
    )
    polish = (
        "Cześć, jak się masz?",
        "Szybki brązowy lis przeskakuje nad leniwym psem.",
        "Proszę zamknąć drzwi, gdy wychodzisz.",
        "Szła powoli przez ogród.",
        "Spotkanie przebiegło dobrze wczoraj.",
        "Myślę, że powinniśmy ponownie rozważyć podejście.",
        "Dziękuję za pomoc przy tym projekcie.",
        "Pogoda jest piękna tego ranka.",
        "Daj mi znać, jeśli potrzebujesz czegoś jeszcze.",
        "Doceniamy wasze ciągłe wsparcie.",
        "Wyniki przekroczyły nasze oczekiwania.",
        "Czy możesz wysłać mi zaktualizowany raport?",
        "Nowy projekt wygląda świetnie.",
        "Skontaktuję się z tobą w przyszłym tygodniu.",
        "System działa teraz płynnie.",
    )

    # English is always drawn before Polish within an iteration.
    pools = (("en", english), ("pl", polish))

    collected = []
    for _ in range(count):
        for tag, pool in pools:
            sentence = random.choice(pool)
            collected.append(make(f"[{tag}] {sentence}", sentence))
    return collected
|
|
|
|
|
|
def gen_mixed(count=150):
    """Complex mixed examples with multiple normalization needs.

    The hand-written English and Polish pairs below are always emitted (their
    inputs already carry the language tag).  The remainder, up to ``count``
    in total, is filled with templated "acronym report" sentences combining a
    spelled acronym, a plain number and a currency amount, each randomly in
    English or Polish.  If ``count`` is smaller than the number of
    hand-written pairs, only those pairs are returned.

    Args:
        count: Target total number of examples.

    Returns:
        List of examples built by ``make(tagged_input, normalized_output)``.
    """
    mixed_en = [
        (
            "[en] The CPU usage hit 95% at 14:30, costing us $4,500 in SLA penalties.",
            "The C P U usage hit ninety-five percent at fourteen thirty, costing us four thousand five hundred dollars in S L A penalties."
        ),
        (
            "[en] **WARNING**: Server at 192.168.1.42 returned HTTP 503 error.",
            "WARNING: Server at one hundred ninety-two dot one hundred sixty-eight dot one dot forty-two returned H T T P five hundred three error."
        ),
        (
            "[en] Send the PDF to john@example.com by 05/30/2026.",
            "Send the P D F to john at example dot com by May thirtieth, twenty twenty-six."
        ),
        (
            "[en] The NASA rover traveled 3.7km at 0.5km/h on Mars.",
            "The NASA rover traveled three point seven kilometers at zero point five kilometers per hour on Mars."
        ),
        (
            "[en] Update to v2.4.1 — fixes #347 & improves GPU performance by 12%.",
            "Update to version two point four point one — fixes number three hundred forty-seven and improves G P U performance by twelve percent."
        ),
        (
            "[en] The FBI & CIA issued a joint FAQ about the VPN breach affecting 10,000+ users.",
            "The F B I and C I A issued a joint F A Q about the V P N breach affecting ten thousand plus users."
        ),
        (
            "[en] Meeting at 3:00 PM in room 401. Budget: €250,000. Contact HR ASAP.",
            "Meeting at three PM in room four hundred one. Budget: two hundred fifty thousand euros. Contact H R A S A P."
        ),
        (
            "[en] The LED display shows -5°C. The ATM is 200m away on 3rd street.",
            "The L E D display shows minus five degrees Celsius. The A T M is two hundred meters away on third street."
        ),
        (
            "[en] Check https://docs.python.org/api/v3 for the SDK documentation.",
            "Check docs dot python dot org slash api slash v three for the S D K documentation."
        ),
        (
            "[en] Flight BA247 departs at 08:15 from gate 12B. ETA: 11:30.",
            "Flight B A two forty-seven departs at eight fifteen from gate twelve B. E T A: eleven thirty."
        ),
        (
            "[en] The IoT device uses 2.4GHz WiFi & draws 5W at 12V DC.",
            "The I o T device uses two point four gigahertz WiFi and draws five watts at twelve volts D C."
        ),
        (
            "[en] She recieved the USB drive with 256GB of ML training data.",
            "She received the U S B drive with two hundred fifty-six gigabytes of M L training data."
        ),
        (
            "[en] Callsign: Tango Alpha Foxtrot. Coordinates: 52°N, 21°E.",
            "Callsign: T A F. Coordinates: fifty-two degrees north, twenty-one degrees east."
        ),
    ]

    mixed_pl = [
        (
            "[pl] Użycie CPU osiągnęło 95% o 14:30, kosztując nas $4,500 kar SLA.",
            # "o" + clock hour takes the locative ordinal: "o czternastej".
            "Użycie C P U osiągnęło dziewięćdziesiąt pięć procent o czternastej trzydzieści, kosztując nas cztery tysiące pięćset dolarów kar S L A."
        ),
        (
            "[pl] **UWAGA**: Serwer 192.168.1.42 zwrócił błąd HTTP 503.",
            "UWAGA: Serwer sto dziewięćdziesiąt dwa kropka sto sześćdziesiąt osiem kropka jeden kropka czterdzieści dwa zwrócił błąd H T T P pięćset trzy."
        ),
        (
            "[pl] Wyślij PDF na kontakt@firma.pl do 30.05.2026.",
            "Wyślij P D F na kontakt małpa firma dot P L do trzydziestego maja dwa tysiące dwudziestego szóstego."
        ),
        (
            "[pl] Łazik NASA przejechał 3,7km z prędkością 0,5km/h na Marsie.",
            "Łazik NASA przejechał trzy przecinek siedem kilometrów z prędkością zero przecinek pięć kilometrów na godzinę na Marsie."
        ),
        (
            "[pl] Aktualizacja do v2.4.1 — naprawia #347 & poprawia wydajność GPU o 12%.",
            # "do" governs the genitive: "do wersji", not "do wersja".
            "Aktualizacja do wersji dwa kropka cztery kropka jeden — naprawia numer trzysta czterdzieści siedem i poprawia wydajność G P U o dwanaście procent."
        ),
        (
            "[pl] ABW i CBA wydały wspólne FAQ o naruszeniu VPN dotyczącym 10000+ użytkowników.",
            "A B W i C B A wydały wspólne F A Q o naruszeniu V P N dotyczącym dziesięć tysięcy plus użytkowników."
        ),
        (
            "[pl] Spotkanie o 15:00 w pokoju 401. Budżet: €250000. Skontaktuj się z HR jak najszybciej.",
            "Spotkanie o piętnaście zero zero w pokoju czterysta jeden. Budżet: dwieście pięćdziesiąt tysięcy euro. Skontaktuj się z H R jak najszybciej."
        ),
        (
            "[pl] Wyświetlacz LED pokazuje -5°C. Bankomat jest 200m dalej na 3. ulicy.",
            "Wyświetlacz L E D pokazuje minus pięć stopni Celsjusza. Bankomat jest dwieście metrów dalej na trzeciej ulicy."
        ),
        (
            "[pl] Sprawdź https://docs.python.org/api/v3 po dokumentację SDK.",
            "Sprawdź docs dot python dot org slash api slash v trzy po dokumentację S D K."
        ),
        (
            "[pl] Lot BA247 odlatuje o 08:15 z bramki 12B. Przewidywany czas: 11:30.",
            "Lot B A dwieście czterdzieści siedem odlatuje o osiem piętnaście z bramki dwanaście B. Przewidywany czas: jedenaście trzydzieści."
        ),
        (
            "[pl] Urządzenie IoT używa WiFi 2,4GHz i pobiera 5W przy 12V DC.",
            "Urządzenie I o T używa WiFi dwa przecinek cztery gigaherców i pobiera pięć watów przy dwanaście woltów D C."
        ),
        (
            "[pl] Kupiła pendrive USB z 256GB danych treningowych ML za 149 zł.",
            "Kupiła pendrive U S B z dwieście pięćdziesiąt sześć gigabajtów danych treningowych M L za sto czterdzieści dziewięć złotych."
        ),
        (
            "[pl] Znak wywoławczy: Tango Alfa Foxtrot. Współrzędne: 52°N, 21°E.",
            "Znak wywoławczy: T A F. Współrzędne: pięćdziesiąt dwa stopnie północ, dwadzieścia jeden stopni wschód."
        ),
    ]

    # Hand-written pairs first: English, then Polish.
    examples = [make(raw, norm) for raw, norm in mixed_en + mixed_pl]

    # Generate more mixed by combining elements
    acronyms = list(SPELLED_ACRONYMS.keys())
    for _ in range(count - len(mixed_en) - len(mixed_pl)):
        n = random.randint(100, 9999)
        acr = random.choice(acronyms)
        acr_sp = SPELLED_ACRONYMS[acr]
        sym = random.choice(['$', '€', 'zł', '£'])
        amount = random.randint(10, 99999)

        # Prefix symbols attach to the front of the amount; "zł" follows it
        # after a space.  The written form is the same for both languages.
        cur_raw = f"{sym}{amount:,}" if sym in {'$', '€', '£'} else f"{amount:,} {sym}"

        if random.random() < 0.5:
            examples.append(make(
                f"[en] The {acr} report shows {n} entries totaling {cur_raw}.",
                f"The {acr_sp} report shows {n2w(n, 'en')} entries totaling {fmt_currency(amount, sym, 'en')}."
            ))
        else:
            examples.append(make(
                f"[pl] Raport {acr} pokazuje {n} wpisów na łącznie {cur_raw}.",
                f"Raport {acr_sp} pokazuje {n2w(n, 'pl')} wpisów na łącznie {fmt_currency(amount, sym, 'pl')}."
            ))

    return examples
|
|
|
|
|
|
def gen_abbreviations(count=40):
    """Common abbreviations.

    Each of the ``count`` iterations appends two examples: a random English
    abbreviation expanded in a fixed "... Smith will arrive at 5." sentence,
    and a random Polish abbreviation expanded in a fixed
    "... Kowalski przyjedzie o 5." sentence.  The trailing digit is spelled
    out in the target as well.

    Args:
        count: Number of iterations (two examples are appended per
            iteration).

    Returns:
        List of examples built by ``make(tagged_input, normalized_output)``.
    """
    # NOTE(review): non-title entries (e.g. 'etc.', 'vs.', 'np.') are also
    # placed in the name slot of the carrier sentences below.
    abbrevs_en = {
        'Mr.': 'Mister', 'Mrs.': 'Misses', 'Dr.': 'Doctor',
        'St.': 'Saint', 'Ave.': 'Avenue', 'Blvd.': 'Boulevard',
        'Jr.': 'Junior', 'Sr.': 'Senior', 'Prof.': 'Professor',
        'vs.': 'versus', 'etc.': 'et cetera', 'e.g.': 'for example',
        'i.e.': 'that is', 'approx.': 'approximately',
        'dept.': 'department', 'govt.': 'government',
        'inc.': 'incorporated', 'corp.': 'corporation',
    }
    abbrevs_pl = {
        'dr': 'doktor', 'prof.': 'profesor', 'mgr': 'magister',
        'inż.': 'inżynier', 'ul.': 'ulica', 'al.': 'aleja',
        'pl.': 'plac', 'os.': 'osiedle', 'nr': 'numer',
        'tel.': 'telefon', 'godz.': 'godzina', 'ok.': 'około',
        'np.': 'na przykład', 'tj.': 'to jest', 'itd.': 'i tak dalej',
        'itp.': 'i tym podobne', 'wg': 'według', 'dot.': 'dotyczący',
    }

    # Materialize the candidate lists once instead of on every iteration.
    en_items = list(abbrevs_en.items())
    pl_items = list(abbrevs_pl.items())

    examples = []
    for _ in range(count):
        abbr, full = random.choice(en_items)
        examples.append(make(
            f"[en] {abbr} Smith will arrive at 5.",
            f"{full} Smith will arrive at five."
        ))

        abbr, full = random.choice(pl_items)
        examples.append(make(
            f"[pl] {abbr} Kowalski przyjedzie o 5.",
            # "o" + clock hour takes the locative ordinal: "o piątej".
            f"{full} Kowalski przyjedzie o piątej."
        ))

    return examples
|
|
|
|
|
|
def gen_contractions(count=40):
    """Build ``count`` English examples that expand a contraction.

    A random contraction is dropped into a random sentence frame; the input
    is the ``[en]``-tagged frame with the contraction and the target is the
    same frame with the expanded wording.
    """
    expansions = {
        "don't": "do not", "doesn't": "does not", "didn't": "did not",
        "won't": "will not", "wouldn't": "would not", "couldn't": "could not",
        "shouldn't": "should not", "can't": "cannot", "isn't": "is not",
        "aren't": "are not", "wasn't": "was not", "weren't": "were not",
        "hasn't": "has not", "haven't": "have not", "hadn't": "had not",
        "I'm": "I am", "I've": "I have", "I'll": "I will", "I'd": "I would",
        "you're": "you are", "you've": "you have", "you'll": "you will",
        "he's": "he is", "she's": "she is", "it's": "it is",
        "we're": "we are", "we've": "we have", "we'll": "we will",
        "they're": "they are", "they've": "they have", "they'll": "they will",
        "that's": "that is", "there's": "there is", "here's": "here is",
        "let's": "let us", "who's": "who is", "what's": "what is",
    }
    frames = [
        "{} the right approach.",
        "I think {} a good idea.",
        "{} going to work out.",
        "They said {} ready yet.",
        "{} what we expected.",
    ]

    pairs = list(expansions.items())
    collected = []
    for _ in range(count):
        # The contraction is drawn before the frame.
        short, spelled_out = random.choice(pairs)
        frame = random.choice(frames)
        tagged_input = f"[en] {frame.format(short)}"
        target = frame.format(spelled_out)
        collected.append(make(tagged_input, target))
    return collected
|
|
|
|
|
|
def _pl_magnitude_word(whole, has_fraction, forms):
    """Pick the Polish magnitude noun that agrees with the spoken number.

    ``forms`` is ``(one, few, many, fractional)``, e.g.
    ``('milion', 'miliony', 'milionów', 'miliona')``.
    """
    one, few, many, fractional = forms
    if has_fraction:
        # Decimal quantities govern the genitive singular ("2,5 miliona").
        return fractional
    if whole == 1:
        return one
    # 2-4, 22-24, ... take the "few" form; 12-14 are the exception.
    if whole % 10 in (2, 3, 4) and whole % 100 not in (12, 13, 14):
        return few
    return many


def gen_large_numbers_shorthand(count=40):
    """Large number shorthands: 4.5M, 2.3B, 1.2K etc.

    Each of the ``count`` iterations draws one shorthand value (whole part
    1-99, optional single fractional digit, suffix K/M/B) and appends six
    examples: the value as a ``$``, ``€`` and ``zł`` amount, each under
    ``[en]`` and ``[pl]``.  The Polish magnitude word is inflected to agree
    with the number ("jeden milion", "dwa miliony", "pięć milionów",
    "dwa przecinek pięć miliona"); the currency name that follows stays in
    the plural form requested from ``en_currency_name`` / ``pl_currency_name``.

    Args:
        count: Number of iterations (six examples are appended per
            iteration).

    Returns:
        List of examples built by ``make(tagged_input, normalized_output)``.
    """
    # suffix -> (English word, Polish forms: one, few, many, fractional)
    suffixes = {
        'K': ('thousand', ('tysiąc', 'tysiące', 'tysięcy', 'tysiąca')),
        'M': ('million', ('milion', 'miliony', 'milionów', 'miliona')),
        'B': ('billion', ('miliard', 'miliardy', 'miliardów', 'miliarda')),
    }
    suffix_items = list(suffixes.items())

    examples = []
    for _ in range(count):
        suffix, (en_word, pl_forms) = random.choice(suffix_items)
        whole = random.randint(1, 99)
        frac = random.choice([0, random.randint(1, 9)])
        pl_word = _pl_magnitude_word(whole, bool(frac), pl_forms)

        if frac:
            raw = f"{whole}.{frac}{suffix}"
            en_spoken = f"{n2w(whole, 'en')} point {n2w(frac, 'en')} {en_word}"
            pl_spoken = f"{n2w(whole, 'pl')} przecinek {n2w(frac, 'pl')} {pl_word}"
        else:
            raw = f"{whole}{suffix}"
            en_spoken = f"{n2w(whole, 'en')} {en_word}"
            pl_spoken = f"{n2w(whole, 'pl')} {pl_word}"

        for sym in ['$', '€', 'zł']:
            # Prefix symbols attach to the front; "zł" follows after a space.
            if sym in {'$', '€'}:
                cur_raw = f"{sym}{raw}"
            else:
                cur_raw = f"{raw} {sym}"

            examples.append(make(
                f"[en] Revenue reached {cur_raw}.",
                f"Revenue reached {en_spoken} {en_currency_name(sym, 2)}."
            ))
            examples.append(make(
                f"[pl] Przychody osiągnęły {cur_raw}.",
                f"Przychody osiągnęły {pl_spoken} {pl_currency_name(sym, 5)}."
            ))

    return examples
|
|
|
|
|
|
# ──────────────────────────────────────────────────────────────
|
|
# MAIN
|
|
# ──────────────────────────────────────────────────────────────
|
|
|
|
def main():
    """Generate every category, shuffle, and write the dataset to ``OUTPUT``.

    Seeds the ``random`` module with ``SEED``, runs each generator with its
    configured count (printing a per-category tally), shuffles the combined
    list and writes one JSON object per line, then prints the grand total.
    """
    random.seed(SEED)

    # (label, generator, count) — run in this order after the single seed
    # call above.
    plan = (
        ("numbers", gen_numbers, 200),
        ("negatives_decimals", gen_negatives_decimals, 80),
        ("ordinals", gen_ordinals, 80),
        ("percentages", gen_percentages, 60),
        ("dates", gen_dates, 100),
        ("times", gen_times, 80),
        ("currency", gen_currency, 120),
        ("units", gen_units, 80),
        ("temperatures", gen_temperatures, 40),
        ("acronyms", gen_acronyms, 150),
        ("nato", gen_nato, 100),
        ("spelling", gen_spelling_corrections, 80),
        ("urls_emails", gen_urls_emails, 80),
        ("symbols", gen_symbols, 60),
        ("markdown", gen_markdown, 60),
        ("phone_numbers", gen_phone_numbers, 40),
        ("versions_ips", gen_versions_ips, 40),
        ("passthrough", gen_passthrough, 80),
        ("mixed", gen_mixed, 150),
        ("abbreviations", gen_abbreviations, 40),
        ("contractions", gen_contractions, 40),
        ("large_numbers", gen_large_numbers_shorthand, 40),
    )

    dataset = []
    for label, build, size in plan:
        batch = build(size)
        print(f" {label}: {len(batch)} examples")
        dataset += batch

    random.shuffle(dataset)

    # ensure_ascii=False keeps non-ASCII characters unescaped in the file.
    with OUTPUT.open("w", encoding="utf-8") as sink:
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in dataset
        )

    print(f"\nTotal: {len(dataset)} examples -> {OUTPUT}")
|
|
|
|
|
|
# Script entry point: generate the dataset only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|