@@ -1,9 +1,9 @@
-from typing import Optional, Type
+from typing import Any, Optional, Type
 from warnings import warn
 
 import torch
+import torch.nn as nn
 from captum.optim.models._common import RedirectedReluLayer, SkipLayer
-from torch import nn
 
 GS_SAVED_WEIGHTS_URL = (
     "https://pytorch.s3.amazonaws.com/models/captum/clip_resnet50x4_image.pt"
@@ -14,7 +14,7 @@ def clip_resnet50x4_image(
     pretrained: bool = False,
     progress: bool = True,
     model_path: Optional[str] = None,
-    **kwargs
+    **kwargs: Any,
 ) -> "CLIP_ResNet50x4Image":
     """
     The visual portion of OpenAI's ResNet 50x4 CLIP model from 'Learning Transferable
@@ -23,9 +23,8 @@ def clip_resnet50x4_image(
     This model can be combined with the CLIP ResNet 50x4 Text model to create the full
     CLIP ResNet 50x4 model.
 
-    AvgPool2d layers were replaced with AdaptiveAvgPool2d to allow for any input height
-    and width size, though the best results are obtained by using the model's intended
-    input height and width of 288x288.
+    Note that model inputs are expected to have a shape of: [B, 3, 288, 288] or
+    [3, 288, 288].
 
     See here for more details:
     https://github.com/openai/CLIP
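A minimal usage sketch of the fixed-size input expectation described in the updated docstring (not part of this commit; the import path is an assumption about how the builder is exposed in the optim-wip branch):

```python
# Sketch only: the export path below is assumed, adjust to the actual module layout.
import torch
from captum.optim.models import clip_resnet50x4_image  # assumed import location

model = clip_resnet50x4_image(pretrained=False).eval()

# With the adaptive pooling reverted to fixed AvgPool2d, inputs should be 288x288.
x = torch.randn(1, 3, 288, 288)
with torch.no_grad():
    out = model(x)
print(out.shape)  # expected: the 640-dim image embedding, torch.Size([1, 640])
```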
@@ -124,13 +123,13 @@ def __init__(
         self.conv3 = nn.Conv2d(40, 80, kernel_size=3, padding=1, bias=False)
         self.bn3 = nn.BatchNorm2d(80)
         self.relu3 = activ()
-        self.avgpool = nn.AdaptiveAvgPool2d(72)
+        self.avgpool = nn.AvgPool2d(2)
 
         # Residual layers
-        self.layer1 = self._build_layer(80, 80, 4, stride=1, pooling=72, activ=activ)
-        self.layer2 = self._build_layer(320, 160, 6, stride=2, pooling=36, activ=activ)
-        self.layer3 = self._build_layer(640, 320, 10, stride=2, pooling=18, activ=activ)
-        self.layer4 = self._build_layer(1280, 640, 6, stride=2, pooling=9, activ=activ)
+        self.layer1 = self._build_layer(80, 80, blocks=4, stride=1, activ=activ)
+        self.layer2 = self._build_layer(320, 160, blocks=6, stride=2, activ=activ)
+        self.layer3 = self._build_layer(640, 320, blocks=10, stride=2, activ=activ)
+        self.layer4 = self._build_layer(1280, 640, blocks=6, stride=2, activ=activ)
 
         # Attention Pooling
         self.attnpool = AttentionPool2d(9, 2560, out_features=640, num_heads=40)
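The hard-coded pooling targets removed here (72, 36, 18, 9) fall out of the arithmetic for the intended 288x288 input; a quick sanity sketch of that bookkeeping (my numbers, assuming the stem's first conv has stride 2 as in OpenAI's ModifiedResNet):

```python
# Spatial-size bookkeeping for a 288x288 input (assumes stem conv1 has stride 2).
size = 288
size //= 2  # stem conv1, stride 2  -> 144
size //= 2  # stem nn.AvgPool2d(2)  -> 72 (was nn.AdaptiveAvgPool2d(72))
assert size == 72

# layer1..layer4 strides and the AdaptiveAvgPool2d targets they previously encoded.
for stride, old_target in [(1, 72), (2, 36), (2, 18), (2, 9)]:
    if stride > 1:
        size //= stride  # nn.AvgPool2d(stride); no pooling when stride == 1
    assert size == old_target
print(size)  # 9, matching AttentionPool2d(9, ...)
```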
@@ -141,7 +140,6 @@ def _build_layer(
         planes: int = 80,
         blocks: int = 4,
         stride: int = 1,
-        pooling: int = 72,
         activ: Type[nn.Module] = nn.ReLU,
     ) -> nn.Module:
         """
@@ -160,18 +158,16 @@ def _build_layer(
                 Default: 4
             stride (int, optional): The stride value to use for the Bottleneck layers.
                 Default: 1
-            pooling (int, optional): The output size used for nn.AdaptiveAvgPool2d.
-                Default: 72
             activ (type of nn.Module, optional): The nn.Module class type to use for
                 activation layers.
                 Default: nn.ReLU
 
         Returns:
             residual_layer (nn.Sequential): A full residual layer.
         """
-        layers = [Bottleneck(inplanes, planes, stride, pooling=pooling, activ=activ)]
+        layers = [Bottleneck(inplanes, planes, stride, activ=activ)]
         for _ in range(blocks - 1):
-            layers += [Bottleneck(planes * 4, planes, pooling=pooling, activ=activ)]
+            layers += [Bottleneck(planes * 4, planes, activ=activ)]
         return nn.Sequential(*layers)
 
     def _transform_input(self, x: torch.Tensor) -> torch.Tensor:
@@ -230,7 +226,6 @@ def __init__(
         inplanes: int = 80,
         planes: int = 80,
         stride: int = 1,
-        pooling: int = 72,
         activ: Type[nn.Module] = nn.ReLU,
     ) -> None:
         """
@@ -244,8 +239,6 @@ def __init__(
                 Default: 80
             stride (int, optional): The stride value to use for the Bottleneck layers.
                 Default: 1
-            pooling (int, optional): The output size used for nn.AdaptiveAvgPool2d.
-                Default: 72
             activ (type of nn.Module, optional): The nn.Module class type to use for
                 activation layers.
                 Default: nn.ReLU
@@ -259,15 +252,15 @@ def __init__(
         self.bn2 = nn.BatchNorm2d(planes)
         self.relu2 = activ()
 
-        self.avgpool = nn.AdaptiveAvgPool2d(pooling)
+        self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity()
 
         self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
         self.bn3 = nn.BatchNorm2d(planes * 4)
         self.relu3 = activ()
 
         if stride > 1 or inplanes != planes * 4:
             self.downsample = nn.Sequential(
-                nn.AdaptiveAvgPool2d(pooling),
+                nn.AvgPool2d(stride),
                 nn.Conv2d(inplanes, planes * 4, kernel_size=1, stride=1, bias=False),
                 nn.BatchNorm2d(planes * 4),
             )
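For the fixed feature sizes above, the old and new pooling layers should be numerically identical, since each AdaptiveAvgPool2d target divides its input size exactly; a small check (my own, not from this commit):

```python
# Equivalence check: AdaptiveAvgPool2d(target) matches AvgPool2d(stride) when the
# input size is an exact multiple of the target (e.g. 72 -> 36 with stride 2).
import torch
import torch.nn as nn

x = torch.randn(1, 320, 72, 72)    # e.g. the layer2 input at 288x288 resolution
old = nn.AdaptiveAvgPool2d(36)(x)  # previous behaviour (pooling=36)
new = nn.AvgPool2d(2)(x)           # new behaviour (stride=2)
assert torch.allclose(old, new, atol=1e-6)
```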