@@ -78,7 +78,7 @@ def load_model(self, name: str) -> Tuple[nn.Module, ExampleType]:
             from transformers import CLIPVisionModel
 
             config = AutoConfig.from_pretrained(name, torchscript=True)
-            model = CLIPVisionModel._from_config(config)
+            model = CLIPVisionModel._from_config(config, attn_implementation="eager")
             preprocessor = CLIPFeatureExtractor.from_pretrained(name)
             encoded_input = preprocessor(self.image, return_tensors="pt")
             example = dict(encoded_input)
@@ -97,7 +97,7 @@ def load_model(self, name: str) -> Tuple[nn.Module, ExampleType]:
             from transformers import ViTImageProcessor
 
             config = AutoConfig.from_pretrained(name, torchscript=True)
-            model = VisionEncoderDecoderModel._from_config(config)
+            model = VisionEncoderDecoderModel._from_config(config, attn_implementation="eager")
             feature_extractor = ViTImageProcessor.from_pretrained(name)
             encoded_input = feature_extractor(images=[self.image], return_tensors="pt")
 
@@ -117,7 +117,7 @@ def forward(self, x):
             from transformers import Wav2Vec2ForSequenceClassification
 
             config = AutoConfig.from_pretrained(name, torchscript=True)
-            model = Wav2Vec2ForSequenceClassification._from_config(config)
+            model = Wav2Vec2ForSequenceClassification._from_config(config, attn_implementation="eager")
             processor = AutoFeatureExtractor.from_pretrained(name)
             input_values = processor(torch.randn(16000).numpy(), sampling_rate=16_000, return_tensors="pt")
             example = {"input_values": input_values.input_values}
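
For context, a minimal sketch of the pattern these hunks apply, assuming transformers >= 4.36 (where _from_config accepts attn_implementation) and using "facebook/wav2vec2-base" purely as an illustrative checkpoint; the trace call at the end is an assumption about how the harness consumes the model, not part of this change.

# Sketch: build a model from config with the eager attention implementation,
# then trace it with TorchScript. The checkpoint name is illustrative only.
import torch
from transformers import AutoConfig, AutoFeatureExtractor, Wav2Vec2ForSequenceClassification

name = "facebook/wav2vec2-base"  # hypothetical example checkpoint

# torchscript=True makes the model return tuples instead of ModelOutput objects
config = AutoConfig.from_pretrained(name, torchscript=True)

# attn_implementation="eager" selects the plain PyTorch attention path,
# avoiding SDPA code that may not trace cleanly
model = Wav2Vec2ForSequenceClassification._from_config(config, attn_implementation="eager")
model.eval()

processor = AutoFeatureExtractor.from_pretrained(name)
input_values = processor(torch.randn(16000).numpy(), sampling_rate=16_000, return_tensors="pt")

with torch.no_grad():
    traced = torch.jit.trace(model, (input_values.input_values,), strict=False)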