@@ -63,8 +63,6 @@ def wrapper(*args, **kwargs):
63
63
"num_channels" : 3 ,
64
64
"point_batch_size" : 3 ,
65
65
"nb_points_per_image" : 2 ,
66
- "visual_seq_length" : 16 ,
67
- "visual_embedding_dim" : 20 ,
68
66
# audio
69
67
"feature_size" : 80 ,
70
68
"nb_max_frames" : 3000 ,
@@ -808,9 +806,6 @@ class DummyVisionInputGenerator(DummyInputGenerator):
808
806
"pixel_mask" ,
809
807
"sample" ,
810
808
"latent_sample" ,
811
- "visual_embeds" ,
812
- "visual_token_type_ids" ,
813
- "visual_attention_mask" ,
814
809
)
815
810
816
811
def __init__ (
@@ -821,8 +816,6 @@ def __init__(
821
816
num_channels : int = DEFAULT_DUMMY_SHAPES ["num_channels" ],
822
817
width : int = DEFAULT_DUMMY_SHAPES ["width" ],
823
818
height : int = DEFAULT_DUMMY_SHAPES ["height" ],
824
- visual_seq_length : int = DEFAULT_DUMMY_SHAPES ["visual_seq_length" ],
825
- visual_embedding_dim : int = DEFAULT_DUMMY_SHAPES ["visual_embedding_dim" ],
826
819
** kwargs ,
827
820
):
828
821
self .task = task
@@ -846,8 +839,6 @@ def __init__(
846
839
self .image_size = (self .image_size , self .image_size )
847
840
self .batch_size = batch_size
848
841
self .height , self .width = self .image_size
849
- self .visual_seq_length = visual_seq_length
850
- self .visual_embedding_dim = visual_embedding_dim
851
842
852
843
def generate (self , input_name : str , framework : str = "pt" , int_dtype : str = "int64" , float_dtype : str = "fp32" ):
853
844
if input_name == "pixel_mask" :
@@ -857,30 +848,6 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int
857
848
framework = framework ,
858
849
dtype = int_dtype ,
859
850
)
860
-
861
- elif input_name in "visual_attention_mask" :
862
- return self .random_mask_tensor (
863
- shape = [self .batch_size , self .visual_seq_length ],
864
- padding_side = "right" ,
865
- framework = framework ,
866
- dtype = int_dtype ,
867
- )
868
-
869
- elif input_name == "visual_token_type_ids" :
870
- return self .random_int_tensor (
871
- shape = [self .batch_size , self .visual_seq_length ],
872
- max_value = 1 ,
873
- framework = framework ,
874
- dtype = int_dtype ,
875
- )
876
-
877
- elif input_name == "visual_embeds" :
878
- return self .random_float_tensor (
879
- shape = [self .batch_size , self .visual_seq_length , self .visual_embedding_dim ],
880
- framework = framework ,
881
- dtype = float_dtype ,
882
- )
883
-
884
851
else :
885
852
return self .random_float_tensor (
886
853
shape = [self .batch_size , self .num_channels , self .height , self .width ],
0 commit comments