fix: Qwen3.5 VL processor has no .encode() — use its .tokenizer attribute instead
This commit is contained in:
+3
-1
@@ -97,6 +97,8 @@ def fix_tool_calls(messages):
|
|||||||
def load_and_format(path):
|
def load_and_format(path):
|
||||||
"""Load JSONL manually — pyarrow chokes on mixed tool_calls argument types."""
|
"""Load JSONL manually — pyarrow chokes on mixed tool_calls argument types."""
|
||||||
from datasets import Dataset
|
from datasets import Dataset
|
||||||
|
# Qwen3.5 is a VL model — from_pretrained returns a Processor (which has no .encode()), not a Tokenizer, so unwrap its .tokenizer when present
|
||||||
|
_enc = tokenizer.tokenizer if hasattr(tokenizer, 'tokenizer') else tokenizer
|
||||||
texts = []
|
texts = []
|
||||||
skipped = 0
|
skipped = 0
|
||||||
with open(path) as f:
|
with open(path) as f:
|
||||||
@@ -119,7 +121,7 @@ def load_and_format(path):
|
|||||||
tokenize=False,
|
tokenize=False,
|
||||||
add_generation_prompt=False,
|
add_generation_prompt=False,
|
||||||
)
|
)
|
||||||
if len(tokenizer.encode(text)) <= MAX_SEQ:
|
if len(_enc.encode(text)) <= MAX_SEQ:
|
||||||
texts.append(text)
|
texts.append(text)
|
||||||
else:
|
else:
|
||||||
skipped += 1
|
skipped += 1
|
||||||
|
|||||||
Reference in New Issue
Block a user