From 60f47d03798f996d54a71ef1414101b1ef7bca9a Mon Sep 17 00:00:00 2001 From: marauder-actual Date: Tue, 26 May 2026 14:28:08 +0200 Subject: [PATCH] =?UTF-8?q?fix:=20Qwen3.5=20VL=20processor=20has=20no=20.e?= =?UTF-8?q?ncode()=20=E2=80=94=20use=20.tokenizer=20attr?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- train_v4.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/train_v4.py b/train_v4.py index 1bed668..ed0e18f 100644 --- a/train_v4.py +++ b/train_v4.py @@ -97,6 +97,8 @@ def fix_tool_calls(messages): def load_and_format(path): """Load JSONL manually — pyarrow chokes on mixed tool_calls argument types.""" from datasets import Dataset + # Qwen3.5 is a VL model — from_pretrained returns a Processor, not a Tokenizer + _enc = tokenizer.tokenizer if hasattr(tokenizer, 'tokenizer') else tokenizer texts = [] skipped = 0 with open(path) as f: @@ -119,7 +121,7 @@ def load_and_format(path): tokenize=False, add_generation_prompt=False, ) - if len(tokenizer.encode(text)) <= MAX_SEQ: + if len(_enc.encode(text)) <= MAX_SEQ: texts.append(text) else: skipped += 1