fix: Qwen3.5 VL processor has no .encode() — use .tokenizer attr

This commit is contained in:
marauder-actual
2026-05-26 14:28:08 +02:00
parent 5388df0075
commit 60f47d0379
+3 -1
View File
@@ -97,6 +97,8 @@ def fix_tool_calls(messages):
def load_and_format(path):
"""Load JSONL manually — pyarrow chokes on mixed tool_calls argument types."""
from datasets import Dataset
+# Qwen3.5 is a VL model — from_pretrained returns a Processor, not a Tokenizer
+_enc = tokenizer.tokenizer if hasattr(tokenizer, 'tokenizer') else tokenizer
texts = []
skipped = 0
with open(path) as f:
@@ -119,7 +121,7 @@ def load_and_format(path):
tokenize=False,
add_generation_prompt=False,
)
-if len(tokenizer.encode(text)) <= MAX_SEQ:
+if len(_enc.encode(text)) <= MAX_SEQ:
texts.append(text)
else:
skipped += 1