-
Notifications
You must be signed in to change notification settings - Fork 681
Open
Description
Checklist
- 1. I have searched related issues but cannot get the expected help.
- 2. The bug has not been fixed in the latest version.
- 3. Please note that if the bug-related issue you submitted lacks corresponding environment info and a minimal reproducible demo, it will be challenging for us to reproduce and resolve the issue, reducing the likelihood of receiving feedback.
Describe the bug
Can InternVL3 accept images that are not 448x448? I am following the official demo here and tried changing the `input_size` from `448` to `224` in the `load_video` function, but this fails with a tensor shape mismatch error (see below).
Reproduction
Minimal example:
import torch
from torchvision import transforms as T
from PIL import Image
from decord import VideoReader, cpu
from transformers import AutoTokenizer, AutoModel
# Local checkpoint directory shared by the model and tokenizer loaders.
path = "./pretrained/InternVL3-8B"

# Load the checkpoint in bfloat16 with the checkpoint's own modeling code
# (trust_remote_code=True), then switch to eval mode and move it to the GPU.
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    use_flash_attn=True,
    trust_remote_code=True,
)
model = model.eval().cuda()

# Tokenizer comes from the same checkpoint directory as the model.
tokenizer = AutoTokenizer.from_pretrained(
    path,
    trust_remote_code=True,
    use_fast=False,
)
def build_transform(image_size):
    """Build the preprocessing pipeline applied to each PIL image.

    The steps run in this order: convert to RGB, bicubic resize with
    ``image_size``, center-crop to ``image_size``, convert to a tensor, and
    normalize with the fixed per-channel mean/std defined below.

    Args:
        image_size: Size passed unchanged to both ``T.Resize`` and
            ``T.CenterCrop``.

    Returns:
        A ``T.Compose`` object chaining the steps above.
    """
    channel_mean = (0.485, 0.456, 0.406)
    channel_std = (0.229, 0.224, 0.225)
    steps = [
        T.Lambda(lambda img: img.convert("RGB")),
        T.Resize(image_size, interpolation=T.InterpolationMode.BICUBIC),
        T.CenterCrop(image_size),
        T.ToTensor(),
        T.Normalize(mean=channel_mean, std=channel_std),
    ]
    return T.Compose(steps)
def load_video_frames(video_path, num_frames, image_size):
    """Sample evenly spaced frames from a video and preprocess them.

    Args:
        video_path: Path handed to ``VideoReader``.
        num_frames: Number of sample positions requested from
            ``torch.linspace`` across the whole video.
        image_size: Size forwarded to ``build_transform``.

    Returns:
        A ``(pixel_values, frame_count)`` tuple: the stacked frame tensor
        (``[T, 3, H, W]``) and the number of frames it contains.
    """
    reader = VideoReader(video_path, ctx=cpu(0))
    frame_total = len(reader)

    # Evenly spaced positions from the first to the last frame; .long()
    # truncates the fractional positions to integer frame indices.
    sample_positions = (
        torch.linspace(0, frame_total - 1, steps=num_frames).long().tolist()
    )

    preprocess = build_transform(image_size)
    frames = [
        preprocess(Image.fromarray(reader[position].asnumpy()))
        for position in sample_positions
    ]

    pixel_values = torch.stack(frames)  # [T, 3, H, W]
    return pixel_values, len(frames)
# Reproduction driver: sample 8 frames at 224x224 and ask the model to
# describe the video.
# NOTE(review): the traceback reported for this script shows 512 embedding
# rows being written into 2048 selected slots, i.e. 64 vs 256 rows per frame
# for the 8 frames used here. Presumably chat() reserves placeholders for the
# model's default 448 input size rather than the size actually passed in --
# confirm against modeling_internvl_chat.py.
video_path = "videos/red-panda.mp4"
pixel_values, num_frames = load_video_frames(
    video_path,
    num_frames=8,
    image_size=224,
)

# Match the dtype the model was loaded with, then move the frames to the GPU.
pixel_values = pixel_values.to(dtype=torch.bfloat16).cuda()

# One "<image>" placeholder line per sampled frame, followed by the prompt.
image_placeholders = "<image>\n" * num_frames
question = image_placeholders + "Describe the video."

generation_config = {"max_new_tokens": 512, "do_sample": True}

# Each frame is passed as a single patch.
response, history = model.chat(
    tokenizer,
    pixel_values,
    question,
    generation_config,
    num_patches_list=[1] * num_frames,
    history=None,
    return_history=True,
)
print(f"User: {question}\nAssistant: {response}")
Output:
Traceback (most recent call last):
File "/data/group_data/neuroagents_lab/curious_wm/Minecraft-VLM-Test/upload.py", line 57, in <module>
response, history = model.chat(
^^^^^^^^^^^
File "/data/user_data/yusenh/cache/huggingface/modules/transformers_modules/InternVL3-8B/modeling_internvl_chat.py", line 291, in chat
generation_output = self.generate(
^^^^^^^^^^^^^^
File "/data/group_data/neuroagents_lab/curious_wm/Minecraft-VLM-Test/.conda-env/mc-vlm/lib/python3.11/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/data/user_data/yusenh/cache/huggingface/modules/transformers_modules/InternVL3-8B/modeling_internvl_chat.py", line 334, in generate
input_embeds[selected] = vit_embeds.reshape(-1, C).to(input_embeds.device)
~~~~~~~~~~~~^^^^^^^^^^
RuntimeError: shape mismatch: value tensor of shape [512, 3584] cannot be broadcast to indexing result of shape [2048, 3584]
Environment
name: internvl3-conda
channels:
- conda-forge
- https://repo.anaconda.com/pkgs/main
- https://repo.anaconda.com/pkgs/r
dependencies:
- python=3.11
- transformers=4.54.1
- pytorch=2.6.0
- timm=1.0.19
- einops=0.8.1
Metadata
Metadata
Assignees
Labels
No labels