{ "__config_type__": "robo_orchard_lab.models.bip3d.structure:BIP3DConfig", "class_type": "robo_orchard_lab.models.bip3d.structure:BIP3D", "backbone": { "type": "robo_orchard_lab.models.modules.swin_transformer:SwinTransformer", "embed_dims": 96, "depths": [ 2, 2, 6, 2 ], "num_heads": [ 3, 6, 12, 24 ], "window_size": 7, "mlp_ratio": 4, "qkv_bias": true, "qk_scale": null, "drop_rate": 0.0, "attn_drop_rate": 0.0, "out_indices": [ 1, 2, 3 ], "with_cp": true, "convert_weights": false }, "decoder": { "type": "robo_orchard_lab.models.sem_modules.action_decoder:SEMActionDecoder", "img_cross_attn": { "type": "robo_orchard_lab.models.sem_modules.layers:RotaryAttention", "embed_dims": 256, "num_heads": 8, "max_position_embeddings": 32 }, "norm_layer": { "type": "torch.nn.modules.normalization:RMSNorm", "normalized_shape": 256 }, "ffn": { "type": "robo_orchard_lab.models.layers.transformer_layers:FFN", "embed_dims": 256, "feedforward_channels": 2048, "act_cfg": { "type": "torch.nn.modules.activation:SiLU", "inplace": true } }, "head": { "type": "robo_orchard_lab.models.sem_modules.layers:UpsampleHead", "upsample_sizes": [ 16, 32, 64 ], "input_dim": 256, "dims": [ 128, 64, 8 ], "norm": { "type": "torch.nn.modules.normalization:RMSNorm", "normalized_shape": 256 }, "act": { "type": "torch.nn.modules.activation:SiLU", "inplace": true }, "norm_act_idx": [ 0, 1, 2 ] }, "training_noise_scheduler": { "type": "diffusers.schedulers.scheduling_ddpm:DDPMScheduler", "num_train_timesteps": 1000, "beta_schedule": "squaredcos_cap_v2", "prediction_type": "sample", "clip_sample": false }, "test_noise_scheduler": { "type": "diffusers.schedulers.scheduling_dpmsolver_multistep:DPMSolverMultistepScheduler", "num_train_timesteps": 1000, "beta_schedule": "squaredcos_cap_v2", "prediction_type": "sample" }, "num_inference_timesteps": 10, "joint_self_attn": { "type": "robo_orchard_lab.models.sem_modules.layers:JointGraphAttention", "embed_dims": 256, "num_heads": 8 }, "temp_cross_attn": { "type": "robo_orchard_lab.models.sem_modules.layers:RotaryAttention", "embed_dims": 256, "num_heads": 8, "max_position_embeddings": 32 }, "text_cross_attn": { "type": "robo_orchard_lab.models.sem_modules.layers:RotaryAttention", "embed_dims": 256, "num_heads": 8, "max_position_embeddings": 256 }, "pred_steps": 64, "timestep_norm_layer": { "type": "robo_orchard_lab.models.sem_modules.layers:AdaRMSNorm", "normalized_shape": 256, "condition_dims": 256, "zero": true }, "operation_order": [ "t_norm", "joint_self_attn", "gate_msa", "norm", "temp_cross_attn", "norm", "img_cross_attn", "norm", null, null, "scale_shift", "ffn", "gate_mlp", "t_norm", "joint_self_attn", "gate_msa", "norm", "temp_cross_attn", "norm", "img_cross_attn", "norm", null, null, "scale_shift", "ffn", "gate_mlp", "t_norm", "joint_self_attn", "gate_msa", "norm", "temp_cross_attn", "norm", "img_cross_attn", "norm", null, null, "scale_shift", "ffn", "gate_mlp", "t_norm", "joint_self_attn", "gate_msa", "norm", "temp_cross_attn", "norm", "img_cross_attn", "norm", null, null, "scale_shift", "ffn", "gate_mlp", "t_norm", "joint_self_attn", "gate_msa", "norm", "temp_cross_attn", "norm", "img_cross_attn", "norm", null, null, "scale_shift", "ffn", "gate_mlp", "t_norm", "joint_self_attn", "gate_msa", "norm", "temp_cross_attn", "norm", "img_cross_attn", "norm", null, null, "scale_shift", "ffn", "gate_mlp" ], "feature_level": [ 1, 2 ], "act_cfg": { "type": "torch.nn.modules.activation:SiLU", "inplace": true }, "robot_encoder": { "type": "robo_orchard_lab.models.sem_modules.robot_state_encoder:SEMRobotStateEncoder", "embed_dims": 256, "chunk_size": 1, "joint_self_attn": { "type": "robo_orchard_lab.models.sem_modules.layers:JointGraphAttention", "embed_dims": 256, "num_heads": 8 }, "norm_layer": { "type": "torch.nn.modules.normalization:RMSNorm", "normalized_shape": 256 }, "ffn": { "type": "robo_orchard_lab.models.layers.transformer_layers:FFN", "embed_dims": 256, "feedforward_channels": 2048, "act_cfg": { "type": "torch.nn.modules.activation:SiLU", "inplace": true } }, "temp_self_attn": { "type": "robo_orchard_lab.models.sem_modules.layers:RotaryAttention", "embed_dims": 256, "num_heads": 8, "max_position_embeddings": 32 }, "act_cfg": { "type": "torch.nn.modules.activation:SiLU", "inplace": true }, "operation_order": [ "norm", "joint_self_attn", null, null, "norm", "ffn", "norm", "joint_self_attn", null, null, "norm", "ffn", "norm", "joint_self_attn", null, null, "norm", "ffn", "norm", "joint_self_attn", null, null, "norm", "ffn", "norm" ], "state_dims": 8 }, "state_loss_weights": [ [ 1.0, 1.0, 1.0, 1.0, 0.1, 0.1, 0.1, 0.1 ], [ 1.0, 1.0, 1.0, 1.0, 0.1, 0.1, 0.1, 0.1 ], [ 1.0, 1.0, 1.0, 1.0, 0.1, 0.1, 0.1, 0.1 ], [ 1.0, 1.0, 1.0, 1.0, 0.1, 0.1, 0.1, 0.1 ], [ 1.0, 1.0, 1.0, 1.0, 0.1, 0.1, 0.1, 0.1 ], [ 1.0, 1.0, 1.0, 1.0, 0.1, 0.1, 0.1, 0.1 ], [ 1.0, 2.0, 2.0, 2.0, 0.2, 0.2, 0.2, 0.2 ], [ 1.0, 1.0, 1.0, 1.0, 0.1, 0.1, 0.1, 0.1 ], [ 1.0, 1.0, 1.0, 1.0, 0.1, 0.1, 0.1, 0.1 ], [ 1.0, 1.0, 1.0, 1.0, 0.1, 0.1, 0.1, 0.1 ], [ 1.0, 1.0, 1.0, 1.0, 0.1, 0.1, 0.1, 0.1 ], [ 1.0, 1.0, 1.0, 1.0, 0.1, 0.1, 0.1, 0.1 ], [ 1.0, 1.0, 1.0, 1.0, 0.1, 0.1, 0.1, 0.1 ], [ 1.0, 2.0, 2.0, 2.0, 0.2, 0.2, 0.2, 0.2 ] ], "fk_loss_weight": [ [ 1.0, 1.0, 1.0, 1.0, 0.1, 0.1, 0.1, 0.1 ], [ 1.0, 1.0, 1.0, 1.0, 0.1, 0.1, 0.1, 0.1 ], [ 1.0, 1.0, 1.0, 1.0, 0.1, 0.1, 0.1, 0.1 ], [ 1.0, 1.0, 1.0, 1.0, 0.1, 0.1, 0.1, 0.1 ], [ 1.0, 1.0, 1.0, 1.0, 0.1, 0.1, 0.1, 0.1 ], [ 1.0, 1.0, 1.0, 1.0, 0.1, 0.1, 0.1, 0.1 ], [ 1.0, 2.0, 2.0, 2.0, 0.2, 0.2, 0.2, 0.2 ], [ 1.0, 1.0, 1.0, 1.0, 0.1, 0.1, 0.1, 0.1 ], [ 1.0, 1.0, 1.0, 1.0, 0.1, 0.1, 0.1, 0.1 ], [ 1.0, 1.0, 1.0, 1.0, 0.1, 0.1, 0.1, 0.1 ], [ 1.0, 1.0, 1.0, 1.0, 0.1, 0.1, 0.1, 0.1 ], [ 1.0, 1.0, 1.0, 1.0, 0.1, 0.1, 0.1, 0.1 ], [ 1.0, 1.0, 1.0, 1.0, 0.1, 0.1, 0.1, 0.1 ], [ 1.0, 2.0, 2.0, 2.0, 0.2, 0.2, 0.2, 0.2 ] ], "state_dims": 8 }, "neck": { "type": "robo_orchard_lab.models.modules.channel_mapper:ChannelMapper", "in_channels": [ 192, 384, 768 ], "kernel_size": 1, "out_channels": 256, "act_cfg": null, "bias": true, "norm_cfg": { "type": "torch.nn.modules.normalization:GroupNorm", "num_groups": 32 }, "num_outs": 3 }, "text_encoder": null, "feature_enhancer": null, "spatial_enhancer": { "type": "robo_orchard_lab.models.bip3d.spatial_enhancer:DepthFusionSpatialEnhancer", "embed_dims": 256, "feature_3d_dim": 32, "num_depth_layers": 2, "min_depth": 0.01, "max_depth": 1.2, "num_depth": 128, "with_feature_3d": true, "loss_depth_weight": 1.0 }, "data_preprocessor": { "type": "robo_orchard_lab.models.layers.data_preprocessors:BaseDataPreprocessor", "mean": [ 123.675, 116.28, 103.53 ], "std": [ 58.395, 57.12, 57.375 ], "channel_flip": false, "unsqueeze_depth_channel": true, "batch_transforms": [ { "type": "robo_orchard_lab.models.bip3d.spatial_enhancer:BatchDepthProbGTGenerator", "min_depth": 0.01, "max_depth": 1.2, "num_depth": 128, "origin_stride": 2, "valid_threshold": 0.5, "stride": [ 8, 16, 32 ] } ] }, "backbone_3d": { "type": "robo_orchard_lab.models.modules.resnet:ResNet", "depth": 34, "in_channels": 1, "base_channels": 4, "num_stages": 4, "out_indices": [ 1, 2, 3 ], "bn_eval": true, "with_cp": true, "style": "pytorch" }, "neck_3d": { "type": "robo_orchard_lab.models.modules.channel_mapper:ChannelMapper", "in_channels": [ 8, 16, 32 ], "kernel_size": 1, "out_channels": 32, "act_cfg": null, "bias": true, "norm_cfg": { "type": "torch.nn.modules.normalization:GroupNorm", "num_groups": 4 }, "num_outs": 3 }, "input_2d": "imgs", "input_3d": "depths", "embed_dims": 256, "pre_spatial_enhancer": false }