diff --git a/train_v4.py b/train_v4.py index 1bed668..ed0e18f 100644 --- a/train_v4.py +++ b/train_v4.py @@ -97,6 +97,8 @@ def fix_tool_calls(messages): def load_and_format(path): """Load JSONL manually — pyarrow chokes on mixed tool_calls argument types.""" from datasets import Dataset + # Qwen3.5 is a VL model — from_pretrained returns a Processor, not a Tokenizer + _enc = tokenizer.tokenizer if hasattr(tokenizer, 'tokenizer') else tokenizer texts = [] skipped = 0 with open(path) as f: @@ -119,7 +121,7 @@ def load_and_format(path): tokenize=False, add_generation_prompt=False, ) - if len(tokenizer.encode(text)) <= MAX_SEQ: + if len(_enc.encode(text)) <= MAX_SEQ: texts.append(text) else: skipped += 1