eo1-qwen25_vl-bridge / processing_eo1.py

Upload folder using huggingface_hub

dae45d7 verified 6 months ago

15.6 kB

	# Copyright 2025 EO-Robotics Team. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	import os
	from typing import Union

	import numpy as np
	import torch
	from lerobot.configs.types import FeatureType, NormalizationMode, PolicyFeature
	from lerobot.constants import OBS_STATE
	from lerobot.datasets.utils import cast_stats_to_numpy
	from lerobot.policies.normalize import Normalize, Unnormalize
	from transformers.feature_extraction_utils import BatchFeature
	from transformers.image_utils import ImageInput
	from transformers.processing_utils import (
	ImagesKwargs,
	ProcessingKwargs,
	ProcessorMixin,
	TextKwargs,
	Unpack,
	VideosKwargs,
	)
	from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
	from transformers.video_utils import VideoInput

	# Force TOKENIZERS_PARALLELISM to "0" for this process at import time. This
	# overwrites any value already present in the environment.
	# NOTE(review): presumably meant to turn off the Hugging Face tokenizers'
	# internal parallelism (and its fork warning) -- confirm.
	os.environ["TOKENIZERS_PARALLELISM"] = "0"

	# --- special-token constants ------------------------------------------------
	# Vision tokens.
	DEFAULT_IMAGE_TOKEN = "<|image_pad|>"
	DEFAULT_VIDEO_TOKEN = "<|video_pad|>"
	VISION_START_TOKEN = "<|vision_start|>"
	VISION_END_TOKEN = "<|vision_end|>"

	# Action tokens.
	ACTION_START_TOKEN = "<|action_start|>"
	DEFAULT_ACTION_TOKEN = "<|action_pad|>"
	PASS_ACTION_TOKEN = "<|action_pass|>"
	ACTION_END_TOKEN = "<|action_end|>"

	# State tokens and the task marker appended to the text prompt.
	STATE_START_TOKEN = "<|state_start|>"
	DEFAULT_STATE_TOKEN = "<|state_pad|>"
	STATE_END_TOKEN = "<|state_end|>"
	TASK_VLA_TOKEN = "<|vla|>"


	# Accepted container types for robot states / actions.
	RobotInput = Union[np.ndarray, "torch.Tensor", list[np.ndarray], list["torch.Tensor"]]


	class EO1VisionVideosProcessorKwargs(VideosKwargs, total=False):
	    """Video keyword arguments accepted by the EO1 processor."""

	    # Frames per second: either one number applied to every video, or a
	    # sequence with one entry per video.
	    fps: list[float] | float


	class EO1VisionImagesKwargs(ImagesKwargs):
	    """Image keyword arguments accepted by the EO1 processor.

	    NOTE(review): the field names look like the Qwen2-VL image-processor
	    options (pixel budget and patch/merge geometry) -- confirm their exact
	    semantics against the image processor in use.
	    """

	    min_pixels: int | None
	    max_pixels: int | None
	    patch_size: int | None
	    temporal_patch_size: int | None
	    merge_size: int | None


	class EO1VisionTextKwargs(TextKwargs):
	    """Text keyword arguments accepted by the EO1 processor."""

	    # Number of action tokens each action token in the prompt is expanded to.
	    noise_token_num: int | None
	    # Prompt fragment carrying the action tokens.
	    # NOTE(review): presumably consumed by the chat template -- confirm.
	    noise_prompt: str | None


	class EO1VisionProcessorKwargs(ProcessingKwargs, total=False):
	    """Per-modality keyword groups for the EO1 processor, with text defaults."""

	    # One typed keyword group per modality.
	    text_kwargs: EO1VisionTextKwargs
	    images_kwargs: EO1VisionImagesKwargs
	    videos_kwargs: EO1VisionVideosProcessorKwargs

	    # Defaults for the text group: no padding, no multimodal token-type ids.
	    _defaults = dict(
	        text_kwargs=dict(
	            padding=False,
	            return_mm_token_type_ids=False,
	        ),
	    )


	class EO1VisionProcessor(ProcessorMixin):
	    """EO1Vision Processor for Image, Text, Video, and Robotic Action Processing.

	    Combines an image processor, a tokenizer and a video processor, and adds
	    robot-specific handling on top: expansion of action tokens in the prompt,
	    batching of ``states`` / ``actions``, and per-dataset (``repo_id``)
	    normalization of states and un-normalization of predicted actions.
	    """

	    attributes = ["image_processor", "tokenizer", "video_processor"]
	    valid_kwargs = ["chat_template"]
	    image_processor_class = "AutoImageProcessor"
	    video_processor_class = "AutoVideoProcessor"
	    tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")

	    # Temporary stand-in used while expanding special tokens, so that tokens
	    # that were already expanded are not matched again by the ``while`` loops.
	    _PLACEHOLDER = "<|placeholder|>"

	    def __init__(
	        self,
	        image_processor=None,
	        tokenizer=None,
	        video_processor=None,
	        chat_template=None,
	        robot_config=None,
	        **kwargs,
	    ):
	        """Create the processor.

	        Args:
	            image_processor: Image processor component.
	            tokenizer: Tokenizer component; ``convert_tokens_to_ids`` is called
	                on it, so it must not be ``None`` in practice.
	            video_processor: Video processor component.
	            chat_template: Chat template forwarded to ``ProcessorMixin``.
	            robot_config: Optional dict of robot settings. Keys read by this
	                class: ``features``, ``stats``, ``state_mode``,
	                ``action_chunk_size``, ``max_state_dim``,
	                ``select_video_keys``, ``select_state_keys`` and
	                ``select_action_keys``.
	            **kwargs: Accepted and ignored.
	        """
	        self.image_token = getattr(tokenizer, "image_token", DEFAULT_IMAGE_TOKEN)
	        self.video_token = getattr(tokenizer, "video_token", DEFAULT_VIDEO_TOKEN)
	        self.action_token = getattr(tokenizer, "action_token", DEFAULT_ACTION_TOKEN)
	        self.state_token = getattr(tokenizer, "state_token", DEFAULT_STATE_TOKEN)
	        # ``__call__`` needs this id to build ``mm_token_type_ids``; it was
	        # previously never assigned.
	        self.image_token_id = getattr(tokenizer, "image_token_id", None) or tokenizer.convert_tokens_to_ids(
	            self.image_token
	        )

	        # robot policy
	        # The numeric literals are fallbacks used when the tokenizer lookup
	        # returns a falsy value.
	        self.action_token_id = tokenizer.convert_tokens_to_ids(DEFAULT_ACTION_TOKEN) or 151666
	        self.action_pass_id = tokenizer.convert_tokens_to_ids(PASS_ACTION_TOKEN) or 151667
	        self.robot_config = robot_config or {}
	        # Always define the normalizer tables so that later lookups fail with a
	        # clear message instead of an AttributeError when the config is incomplete.
	        self.normalize_inputs, self.unnormalize_outputs = {}, {}
	        self.set_normalization(self.robot_config)

	        super().__init__(image_processor, tokenizer, video_processor, chat_template=chat_template)

	    def set_normalization(self, robot_config: dict):
	        """Build per-dataset state normalizers and action un-normalizers.

	        Nothing is changed unless ``robot_config`` provides all of ``features``,
	        ``stats`` and ``state_mode``. Otherwise ``self.robot_config`` is replaced
	        by a copy of ``robot_config`` and ``self.normalize_inputs`` /
	        ``self.unnormalize_outputs`` are rebuilt, keyed by the keys of
	        ``robot_config["features"]`` (used as ``repo_id`` elsewhere).

	        Args:
	            robot_config: Robot settings; ``stats`` must contain an entry for
	                every key of ``features``.
	        """
	        features = robot_config.get("features")
	        stats = robot_config.get("stats")
	        state_mode = robot_config.get("state_mode")
	        if features is None or stats is None or state_mode is None:
	            return

	        # States and actions share the same normalization mode.
	        normalization_mapping = {
	            "STATE": NormalizationMode(state_mode),
	            "ACTION": NormalizationMode(state_mode),
	        }
	        normalize_inputs, unnormalize_outputs = {}, {}
	        for repo_id, fea in features.items():
	            stat = cast_stats_to_numpy(stats[repo_id])
	            fea = dataset_to_policy_features(fea)

	            input_features = {k: v for k, v in fea.items() if v.type == FeatureType.STATE}
	            output_features = {k: v for k, v in fea.items() if v.type == FeatureType.ACTION}

	            normalize_inputs[repo_id] = Normalize(input_features, normalization_mapping, stat)
	            unnormalize_outputs[repo_id] = Unnormalize(output_features, normalization_mapping, stat)

	        self.robot_config = dict(robot_config)
	        self.normalize_inputs, self.unnormalize_outputs = normalize_inputs, unnormalize_outputs

	    @staticmethod
	    def _to_batched_tensor(values, unbatched_ndim: int):
	        """Convert a robot input to a tensor, adding a batch axis if it is missing.

	        Lists are stacked along a new first axis. Numpy arrays are converted so
	        that every container named by ``RobotInput`` is accepted. A result
	        whose rank equals ``unbatched_ndim`` gets a leading axis of size 1.
	        """
	        if isinstance(values, list):
	            tensor = torch.stack([torch.as_tensor(item) for item in values], dim=0)
	        else:
	            tensor = torch.as_tensor(values)
	        if tensor.ndim == unbatched_ndim:
	            tensor = tensor.unsqueeze(0)
	        return tensor

	    def __call__(
	        self,
	        images: ImageInput = None,
	        text: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput] = None,
	        videos: VideoInput = None,
	        states: RobotInput = None,
	        actions: RobotInput = None,
	        **kwargs: Unpack[EO1VisionProcessorKwargs],
	    ) -> BatchFeature:
	        """Tokenize the prompt(s) and preprocess the visual and robot inputs.

	        Each image / video token in ``text`` is expanded to as many tokens as
	        the corresponding ``*_grid_thw`` entry requires, and each action token
	        is expanded to ``noise_token_num`` tokens.

	        Args:
	            images: Images handed to the image processor.
	            text: One prompt or a list of prompts. A list passed by the caller
	                is copied, not modified.
	            videos: Videos handed to the video processor.
	            states: Robot states; a 1-D input is treated as a single sample.
	            actions: Robot actions; a 2-D input is treated as a single sample.
	            **kwargs: Processor keyword arguments. ``noise_token_num`` overrides
	                ``robot_config["action_chunk_size"]``; ``noise_prompt`` is
	                accepted and dropped here.

	        Returns:
	            A ``BatchFeature`` merging the tokenizer output, the image and video
	            processor outputs (plus ``second_per_grid_ts`` for videos) and,
	            when given, ``states`` and ``actions``.

	        Raises:
	            ValueError: If ``fps`` is neither a number nor a sequence with one
	                entry per video, or if the prompt contains an action token but
	                no noise token count is available.
	        """
	        output_kwargs = self._merge_kwargs(
	            EO1VisionProcessorKwargs,
	            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
	            **kwargs,
	        )

	        # These two keys are specific to this processor and must not reach the tokenizer.
	        noise_token_num = output_kwargs["text_kwargs"].pop("noise_token_num", None)
	        output_kwargs["text_kwargs"].pop("noise_prompt", None)

	        if images is not None:
	            image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"])
	            image_grid_thw = image_inputs["image_grid_thw"]
	        else:
	            image_inputs = {}
	            image_grid_thw = None

	        if videos is not None:
	            fps = output_kwargs["videos_kwargs"].get("fps", 2.0)
	            videos_inputs = self.video_processor(videos=videos, **output_kwargs["videos_kwargs"])
	            video_grid_thw = videos_inputs["video_grid_thw"]
	            temporal_patch_size = self.video_processor.temporal_patch_size
	            if isinstance(fps, (int, float)):
	                second_per_grid_ts = [temporal_patch_size / fps] * len(video_grid_thw)
	            elif hasattr(fps, "__len__") and len(fps) == len(video_grid_thw):
	                second_per_grid_ts = [temporal_patch_size / tmp for tmp in fps]
	            else:
	                raise ValueError(
	                    f"The length of fps ({len(fps) if hasattr(fps, '__len__') else fps}) must be equal to "
	                    f"the length of video_grid_thw ({len(video_grid_thw)}) or fps should be a single number."
	                )
	            videos_inputs.update({"second_per_grid_ts": second_per_grid_ts})
	        else:
	            videos_inputs = {}
	            video_grid_thw = None

	        # The expansion below rewrites entries of ``text``; work on a copy so a
	        # list supplied by the caller is left untouched.
	        text = text.copy() if isinstance(text, list) else [text]

	        if images is not None:
	            merge_length = self.image_processor.merge_size**2
	            index = 0  # runs over all images across all prompts
	            for i in range(len(text)):
	                while self.image_token in text[i]:
	                    num_image_tokens = image_grid_thw[index].prod() // merge_length
	                    text[i] = text[i].replace(self.image_token, self._PLACEHOLDER * num_image_tokens, 1)
	                    index += 1
	                text[i] = text[i].replace(self._PLACEHOLDER, self.image_token)

	        if videos is not None:
	            merge_length = self.video_processor.merge_size**2
	            index = 0  # runs over all videos across all prompts
	            for i in range(len(text)):
	                while self.video_token in text[i]:
	                    num_video_tokens = video_grid_thw[index].prod() // merge_length
	                    text[i] = text[i].replace(self.video_token, self._PLACEHOLDER * num_video_tokens, 1)
	                    index += 1
	                text[i] = text[i].replace(self._PLACEHOLDER, self.video_token)

	        # noise tokens
	        noise_token_num = noise_token_num or self.robot_config.get("action_chunk_size")
	        for i in range(len(text)):
	            if noise_token_num is None and self.action_token in text[i]:
	                raise ValueError(
	                    "The prompt contains an action token but neither `noise_token_num` nor "
	                    "`robot_config['action_chunk_size']` is set."
	                )
	            while self.action_token in text[i]:
	                text[i] = text[i].replace(self.action_token, self._PLACEHOLDER * noise_token_num, 1)
	            text[i] = text[i].replace(self._PLACEHOLDER, self.action_token)

	        return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", None)
	        text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
	        if return_mm_token_type_ids:
	            # Mark image-token positions with 1, everything else with 0.
	            array_ids = np.array(text_inputs["input_ids"])
	            mm_token_type_ids = np.zeros_like(text_inputs["input_ids"])
	            mm_token_type_ids[array_ids == self.image_token_id] = 1
	            text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist()

	        # robot inputs
	        robot_inputs = {}
	        if states is not None:
	            robot_inputs["states"] = self._to_batched_tensor(states, unbatched_ndim=1)
	        if actions is not None:
	            robot_inputs["actions"] = self._to_batched_tensor(actions, unbatched_ndim=2)

	        # Merge the four mappings into one flat feature dict.
	        return BatchFeature(
	            data={**text_inputs, **image_inputs, **videos_inputs, **robot_inputs},
	        )

	    @property
	    def model_input_names(self):
	        """Names of the model inputs this processor can produce.

	        Tokenizer and image-processor names (de-duplicated, order preserved)
	        followed by ``second_per_grid_ts``, ``states`` and ``actions``.
	        """
	        tokenizer_input_names = self.tokenizer.model_input_names
	        image_processor_input_names = self.image_processor.model_input_names
	        names_from_processor = list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
	        return names_from_processor + ["second_per_grid_ts"] + ["states", "actions"]

	    @torch.no_grad()
	    def _prepare_robot_inputs(self, batch: dict):
	        """Prepare model inputs from raw robot batch.

	        Args:
	            batch: Mapping of batched values, indexed per sample along the first
	                axis. It must contain at least one key starting with
	                ``OBS_STATE`` (used to infer the batch size) and a ``task``
	                entry. An optional ``repo_id`` (a string or one per sample)
	                selects the normalizer; it is not removed from the caller's dict.

	        Returns:
	            ``(batch_messages, batch_states, repo_ids)``: one chat-message list
	            per sample, one normalized and padded state vector per sample, and
	            the per-sample repo ids.

	        Raises:
	            ValueError: If no state key is present, or if ``repo_id`` is absent
	                and no normalizer has been configured.
	        """
	        # Shallow copy so popping ``repo_id`` does not mutate the caller's batch.
	        batch = dict(batch)
	        batch_messages = []
	        batch_states = []
	        max_state_dim = self.robot_config.get("max_state_dim", 32)

	        state_keys = [key for key in batch if key.startswith(OBS_STATE)]
	        if not state_keys:
	            raise ValueError(f"batch has no key starting with {OBS_STATE!r}; cannot infer the batch size")
	        batch_size = len(batch[state_keys[0]])

	        if "repo_id" in batch:
	            repo_ids = batch.pop("repo_id")
	        else:
	            if not self.normalize_inputs:
	                raise ValueError(
	                    "batch has no 'repo_id' and no normalization is configured; "
	                    "provide `features`, `stats` and `state_mode` in robot_config"
	                )
	            print("no repo_id found, use the first one in normalize_inputs")
	            repo_ids = next(iter(self.normalize_inputs))
	        # A single repo id applies to every sample of the batch.
	        if isinstance(repo_ids, str):
	            repo_ids = [repo_ids] * batch_size

	        for i, repo_id in enumerate(repo_ids):
	            mini_batch = {k: v[i] for k, v in batch.items()}

	            normalize_inputs = self.normalize_inputs[repo_id]
	            select_video_keys = self.robot_config["select_video_keys"][repo_id]
	            select_state_keys = self.robot_config["select_state_keys"][repo_id]

	            # The normalizer is fed tensors; convert any non-tensor state features.
	            for k in normalize_inputs.features:
	                if not isinstance(mini_batch[k], torch.Tensor):
	                    mini_batch[k] = torch.tensor(mini_batch[k], dtype=torch.float32)

	            mini_batch = normalize_inputs(mini_batch)
	            states = torch.concat([mini_batch[k] for k in select_state_keys])
	            batch_states.append(pad_vector(states, max_state_dim))
	            messages = [
	                {
	                    "role": "user",
	                    "content": [
	                        *({"type": "image", "image": mini_batch[k]} for k in select_video_keys),
	                        {"type": "state", "state": []},  # chat template state token
	                        {"type": "text", "text": f"{mini_batch['task']}{TASK_VLA_TOKEN}"},
	                    ],
	                }
	            ]
	            batch_messages.append(messages)
	        return batch_messages, batch_states, repo_ids

	    def _process_robot_outputs(self, repo_ids: list[str], actions: torch.Tensor):
	        """Process model outputs back to robot format.

	        Args:
	            repo_ids: One repo id per sample, selecting the un-normalizer and the
	                ``select_action_keys`` entry of the robot config.
	            actions: Predicted actions, indexed per sample along the first axis.
	                Only the first ``sum(feature dims)`` entries of the last axis
	                are used (the rest is presumably padding -- TODO confirm).

	        Returns:
	            The un-normalized actions of all samples stacked along a new first axis.
	        """
	        output_actions = []
	        for i, repo_id in enumerate(repo_ids):
	            unnormalize_outputs = self.unnormalize_outputs[repo_id]
	            select_action_keys = self.robot_config["select_action_keys"][repo_id]
	            features = unnormalize_outputs.features
	            # Slice boundaries of each action feature along the last axis.
	            cum_dims = [0] + np.cumsum([features[k].shape[0] for k in select_action_keys]).tolist()
	            origin_action = actions[i].to(torch.float32)[..., : cum_dims[-1]]
	            batch = {
	                k: origin_action[..., cum_dims[m] : cum_dims[m + 1]] for m, k in enumerate(select_action_keys)
	            }
	            unnorm_actions = unnormalize_outputs(batch)
	            unnorm_actions = torch.concat([unnorm_actions[k] for k in select_action_keys], -1)
	            output_actions.append(unnorm_actions)
	        output_actions = torch.stack(output_actions, dim=0)
	        return output_actions

	    @torch.no_grad()
	    def select_action(self, model, batch: dict, **kwargs):
	        """Run the model on a raw robot batch and return un-normalized actions.

	        Args:
	            model: Object exposing ``device`` and ``sample_actions(**inputs)``.
	            batch: Raw robot batch, see ``_prepare_robot_inputs``.
	            **kwargs: Accepted and ignored.

	        Returns:
	            A ``BatchFeature`` with the single key ``"action"``.
	        """
	        batch_messages, batch_states, repo_ids = self._prepare_robot_inputs(batch)

	        noise_prompt = f"{ACTION_START_TOKEN}{DEFAULT_ACTION_TOKEN}{ACTION_END_TOKEN}"
	        inputs = self.apply_chat_template(
	            batch_messages,
	            states=batch_states,
	            add_generation_prompt=True,
	            noise_prompt=noise_prompt,
	            tokenize=True,
	            return_dict=True,
	            return_tensors="pt",
	        ).to(model.device)

	        actions = model.sample_actions(**inputs).cpu()
	        output_actions = self._process_robot_outputs(repo_ids, actions)
	        return BatchFeature({"action": output_actions})


	def dataset_to_policy_features(features: dict[str, dict]) -> dict[str, PolicyFeature]:
	    """Translate dataset feature specs into LeRobot ``PolicyFeature`` entries.

	    Each spec is classified as VISUAL (dtype ``image`` / ``video``), ENV
	    (key ``observation.environment_state``), STATE (other ``observation*``
	    keys) or ACTION (``action*`` keys). Any other key is skipped.

	    Raises:
	        ValueError: If a visual feature does not have exactly three dimensions.
	    """
	    converted = {}
	    for name, spec in features.items():
	        dims = spec["shape"]
	        if spec["dtype"] in ("image", "video"):
	            kind = FeatureType.VISUAL
	            if len(dims) != 3:
	                raise ValueError(f"Number of dimensions of {name} != 3 (shape={dims})")
	            # Channel-last specs are reordered: (h, w, c) -> (c, h, w).
	            if spec["names"][2] in ("channel", "channels"):
	                dims = (dims[2], dims[0], dims[1])
	        elif name == "observation.environment_state":
	            kind = FeatureType.ENV
	        elif name.startswith("observation"):
	            kind = FeatureType.STATE
	        elif name.startswith("action"):
	            kind = FeatureType.ACTION
	        else:
	            continue
	        converted[name] = PolicyFeature(type=kind, shape=dims)
	    return converted


	def pad_vector(vector, new_dim=32):
	    """Zero-pad the last axis of ``vector`` to ``new_dim`` entries.

	    Accepts e.g. (batch_size x sequence_length x features_dimension) or
	    (batch_size x features_dimension). When the last axis already has
	    ``new_dim`` entries the input tensor itself is returned; otherwise a new
	    tensor with the same dtype and device is returned.
	    """
	    width = vector.shape[-1]
	    if width == new_dim:
	        return vector
	    leading = vector.shape[:-1]
	    padded = torch.zeros(*leading, new_dim, dtype=vector.dtype, device=vector.device)
	    padded[..., :width] = vector
	    return padded


	# Register the processor with the transformers auto-class machinery at import
	# time. NOTE(review): with no argument this presumably targets AutoProcessor,
	# so the class can be loaded as remote code -- confirm against the installed
	# transformers version.
	EO1VisionProcessor.register_for_auto_class()