Update version to 1.0.3 and add ResNet-ViT

tczhangzhi · tczhangzhi · commit c3cf3ae1dcfa · 2021-02-16T11:11:26.000+08:00
diff --git a/README.md b/README.md
@@ -55,17 +55,17 @@ model = VisionTransformer.from_pretrained('ViT-B_16')
 
 Default hyper parameters:
 
-| Param\Model       | ViT-B_16 | ViT-B_32 | ViT-L_16 | ViT-L_32 |
-| ----------------- | -------- | -------- | -------- | -------- |
-| image_size        | 384      | 384      | 384      | 384      |
-| patch_size        | 16       | 32       | 16       | 32       |
-| emb_dim           | 768      | 768      | 1024     | 1024     |
-| mlp_dim           | 3072     | 3072     | 4096     | 4096     |
-| num_heads         | 12       | 12       | 16       | 16       |
-| num_layers        | 12       | 12       | 24       | 24       |
-| num_classes       | 1000     | 1000     | 1000     | 1000     |
-| attn_dropout_rate | 0.0      | 0.0      | 0.0      | 0.0      |
-| dropout_rate      | 0.1      | 0.1      | 0.1      | 0.1      |
+| Param\Model       | ViT-B_16 | ViT-B_32 | ViT-L_16 | ViT-L_32 | R50+ViT-B_16 |
+| ----------------- | -------- | -------- | -------- | -------- | ------------ |
+| image_size        | 384      | 384      | 384      | 384      | 384          |
+| patch_size        | 16       | 32       | 16       | 32       | 1            |
+| emb_dim           | 768      | 768      | 1024     | 1024     | 768          |
+| mlp_dim           | 3072     | 3072     | 4096     | 4096     | 3072         |
+| num_heads         | 12       | 12       | 16       | 16       | 12           |
+| num_layers        | 12       | 12       | 24       | 24       | 12           |
+| num_classes       | 1000     | 1000     | 1000     | 1000     | 1000         |
+| attn_dropout_rate | 0.0      | 0.0      | 0.0      | 0.0      | 0.0          |
+| dropout_rate      | 0.1      | 0.1      | 0.1      | 0.1      | 0.1          |
 
 If you need to modify these hyper parameters, please use:
 
diff --git a/jax_to_pytorch/convert_jax_to_pt/load_jax_weights.py b/jax_to_pytorch/convert_jax_to_pt/load_jax_weights.py
@@ -53,6 +53,18 @@ def replace_names(names):
             new_names.append('classifier')
         elif name == 'cls':
             new_names.append('cls_token')
+        elif name == 'block1':
+            new_names.append('resnet.body.block1')
+        elif name == 'block2':
+            new_names.append('resnet.body.block2')
+        elif name == 'block3':
+            new_names.append('resnet.body.block3')
+        elif name == 'conv_root':
+            new_names.append('resnet.root.conv')
+        elif name == 'gn_root':
+            new_names.append('resnet.root.gn')
+        elif name == 'conv_proj':
+            new_names.append('downsample')
         else:
             new_names.append(name)
     return new_names
@@ -81,9 +93,15 @@ def convert_jax_pytorch(keys, values):
             feat_dim, num_heads, head_dim = tensor_value.shape
             # for multi head attention q/k/v weight
             tensor_value = tensor_value
+        elif torch_names[-1] == 'weight' and 'gn' in torch_names[-2]:
+            # for 'gn' (presumably group norm) weight: flatten to 1-D of length shape[-1]
+            tensor_value = tensor_value.reshape(tensor_value.shape[-1])
         elif num_dim == 2 and torch_names[-1] == 'bias' and torch_names[-2] in ['query', 'key', 'value']:
             # for multi head attention q/k/v bias
             tensor_value = tensor_value
+        elif torch_names[-1] == 'bias' and 'gn' in torch_names[-2]:
+            # for 'gn' (presumably group norm) bias: flatten to 1-D of length shape[-1]
+            tensor_value = tensor_value.reshape(tensor_value.shape[-1])
         elif num_dim == 3 and torch_names[-1] == 'weight' and torch_names[-2] == 'out':
             # for multi head attention out weight
             tensor_value = tensor_value
diff --git a/setup.py b/setup.py
@@ -20,7 +20,7 @@
 EMAIL = 'zhangzhi2018@email.szu.edu.cn'
 AUTHOR = 'ZHANG Zhi'
 REQUIRES_PYTHON = '>=3.5.0'
-VERSION = '1.0.2'
+VERSION = '1.0.3'
 
 # What packages are required for this module to be executed?
 REQUIRED = [
diff --git a/tests/test_model.py b/tests/test_model.py
@@ -12,7 +12,9 @@
 # -- fixtures -------------------------------------------------------------------------------------
 
 
-@pytest.fixture(scope='module', params=['ViT-B_16', 'ViT-B_32', 'ViT-L_16', 'ViT-L_32'])
+@pytest.fixture(
+    scope='module',
+    params=['ViT-B_16', 'ViT-B_32', 'ViT-L_16', 'ViT-L_32', 'R50+ViT-B_16'])
 def model(request):
     return request.param
 
@@ -24,7 +26,8 @@ def pretrained(request):
 
 @pytest.fixture(scope='function')
 def net(model, pretrained):
-    return VisionTransformer.from_pretrained(model) if pretrained else VisionTransformer.from_name(model)
+    return VisionTransformer.from_pretrained(
+        model) if pretrained else VisionTransformer.from_name(model)
 
 
 # -- tests ----------------------------------------------------------------------------------------
@@ -36,6 +39,7 @@ def test_forward(net):
     output = net(data)
     assert not torch.isnan(output).any()
 
+
 @pytest.mark.parametrize('img_size', [224, 256, 512])
 def test_hyper_params(model, img_size):
     """Test `.forward()` doesn't throw an error with different input size"""
diff --git a/vision_transformer_pytorch/__init__.py b/vision_transformer_pytorch/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "1.0.2"
+__version__ = "1.0.3"
 from .model import VisionTransformer, VALID_MODELS
 from .utils import (
     Params,
diff --git a/vision_transformer_pytorch/model.py b/vision_transformer_pytorch/model.py
@@ -7,15 +7,18 @@
 import torch.nn as nn
 import torch.nn.functional as F
 
-from .utils import (get_width_and_height_from_size, load_pretrained_weights, get_model_params)
+from .resnet import StdConv2d
+from .utils import (get_width_and_height_from_size, load_pretrained_weights,
+                    get_model_params)
 
-VALID_MODELS = ('ViT-B_16', 'ViT-B_32', 'ViT-L_16', 'ViT-L_32')
+VALID_MODELS = ('ViT-B_16', 'ViT-B_32', 'ViT-L_16', 'ViT-L_32', 'R50+ViT-B_16')
 
 
 class PositionEmbs(nn.Module):
     def __init__(self, num_patches, emb_dim, dropout_rate=0.1):
         super(PositionEmbs, self).__init__()
-        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, emb_dim))
+        self.pos_embedding = nn.Parameter(
+            torch.randn(1, num_patches + 1, emb_dim))
         if dropout_rate > 0:
             self.dropout = nn.Dropout(dropout_rate)
         else:
@@ -109,11 +112,18 @@ def forward(self, x):
 
 
 class EncoderBlock(nn.Module):
-    def __init__(self, in_dim, mlp_dim, num_heads, dropout_rate=0.1, attn_dropout_rate=0.1):
+    def __init__(self,
+                 in_dim,
+                 mlp_dim,
+                 num_heads,
+                 dropout_rate=0.1,
+                 attn_dropout_rate=0.1):
         super(EncoderBlock, self).__init__()
 
         self.norm1 = nn.LayerNorm(in_dim)
-        self.attn = SelfAttention(in_dim, heads=num_heads, dropout_rate=attn_dropout_rate)
+        self.attn = SelfAttention(in_dim,
+                                  heads=num_heads,
+                                  dropout_rate=attn_dropout_rate)
         if dropout_rate > 0:
             self.dropout = nn.Dropout(dropout_rate)
         else:
@@ -154,7 +164,8 @@ def __init__(self,
         in_dim = emb_dim
         self.encoder_layers = nn.ModuleList()
         for i in range(num_layers):
-            layer = EncoderBlock(in_dim, mlp_dim, num_heads, dropout_rate, attn_dropout_rate)
+            layer = EncoderBlock(in_dim, mlp_dim, num_heads, dropout_rate,
+                                 attn_dropout_rate)
             self.encoder_layers.append(layer)
         self.norm = nn.LayerNorm(in_dim)
 
@@ -190,21 +201,33 @@ def __init__(self, params=None):
         super(VisionTransformer, self).__init__()
         self._params = params
 
-        self.embedding = nn.Conv2d(3, self._params.emb_dim, kernel_size=self.patch_size, stride=self.patch_size)
+        if self._params.resnet:
+            self.resnet = self._params.resnet()
+            self.embedding = nn.Conv2d(self.resnet.width * 16,
+                                       self._params.emb_dim,
+                                       kernel_size=1,
+                                       stride=1)
+        else:
+            self.embedding = nn.Conv2d(3,
+                                       self._params.emb_dim,
+                                       kernel_size=self.patch_size,
+                                       stride=self.patch_size)
         # class token
         self.cls_token = nn.Parameter(torch.zeros(1, 1, self._params.emb_dim))
 
         # transformer
-        self.transformer = Encoder(num_patches=self.num_patches,
-                                   emb_dim=self._params.emb_dim,
-                                   mlp_dim=self._params.mlp_dim,
-                                   num_layers=self._params.num_layers,
-                                   num_heads=self._params.num_heads,
-                                   dropout_rate=self._params.dropout_rate,
-                                   attn_dropout_rate=self._params.attn_dropout_rate)
+        self.transformer = Encoder(
+            num_patches=self.num_patches,
+            emb_dim=self._params.emb_dim,
+            mlp_dim=self._params.mlp_dim,
+            num_layers=self._params.num_layers,
+            num_heads=self._params.num_heads,
+            dropout_rate=self._params.dropout_rate,
+            attn_dropout_rate=self._params.attn_dropout_rate)
 
         # classfier
-        self.classifier = nn.Linear(self._params.emb_dim, self._params.num_classes)
+        self.classifier = nn.Linear(self._params.emb_dim,
+                                    self._params.num_classes)
 
     @property
     def image_size(self):
@@ -218,10 +241,16 @@ def patch_size(self):
     def num_patches(self):
         h, w = self.image_size
         fh, fw = self.patch_size
-        gh, gw = h // fh, w // fw
+        if hasattr(self, 'resnet'):
+            gh, gw = h // fh // self.resnet.downsample, w // fw // self.resnet.downsample
+        else:
+            gh, gw = h // fh, w // fw
         return gh * gw
 
     def extract_features(self, x):
+        if hasattr(self, 'resnet'):
+            x = self.resnet(x)
+
         emb = self.embedding(x)  # (n, c, gh, gw)
         emb = emb.permute(0, 2, 3, 1)  # (n, gh, hw, c)
         b, h, w, c = emb.shape
@@ -266,7 +295,12 @@ def from_name(cls, model_name, in_channels=3, **override_params):
         return model
 
     @classmethod
-    def from_pretrained(cls, model_name, weights_path=None, in_channels=3, num_classes=1000, **override_params):
+    def from_pretrained(cls,
+                        model_name,
+                        weights_path=None,
+                        in_channels=3,
+                        num_classes=1000,
+                        **override_params):
         """create an vision transformer model according to name.
         Args:
             model_name (str): Name for vision transformer.
@@ -288,8 +322,13 @@ def from_pretrained(cls, model_name, weights_path=None, in_channels=3, num_class
         Returns:
             A pretrained vision transformer model.
         """
-        model = cls.from_name(model_name, num_classes=num_classes, **override_params)
-        load_pretrained_weights(model, model_name, weights_path=weights_path, load_fc=(num_classes == 1000))
+        model = cls.from_name(model_name,
+                              num_classes=num_classes,
+                              **override_params)
+        load_pretrained_weights(model,
+                                model_name,
+                                weights_path=weights_path,
+                                load_fc=(num_classes == 1000))
         model._change_in_channels(in_channels)
         return model
 
@@ -302,15 +341,24 @@ def _check_model_name_is_valid(cls, model_name):
             bool: Is a valid name or not.
         """
         if model_name not in VALID_MODELS:
-            raise ValueError('model_name should be one of: ' + ', '.join(VALID_MODELS))
+            raise ValueError('model_name should be one of: ' +
+                             ', '.join(VALID_MODELS))
 
     def _change_in_channels(self, in_channels):
         """Adjust model's first convolution layer to in_channels, if in_channels not equals 3.
         Args:
             in_channels (int): Input data's channel number.
         """
         if in_channels != 3:
-            self.embedding = nn.Conv2d(in_channels,
-                                       self._params.emb_dim,
-                                       kernel_size=self.patch_size,
-                                       stride=self.patch_size)
+            if hasattr(self, 'resnet'):
+                self.resnet.root['conv'] = StdConv2d(in_channels,
+                                                     self.resnet.width,
+                                                     kernel_size=7,
+                                                     stride=2,
+                                                     bias=False,
+                                                     padding=3)
+            else:
+                self.embedding = nn.Conv2d(in_channels,
+                                           self._params.emb_dim,
+                                           kernel_size=self.patch_size,
+                                           stride=self.patch_size)
diff --git a/vision_transformer_pytorch/resnet.py b/vision_transformer_pytorch/resnet.py
diff --git a/vision_transformer_pytorch/utils.py b/vision_transformer_pytorch/utils.py

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-__version__ = "1.0.2"`
	`1`	`+__version__ = "1.0.3"`
`2`	`2`	`from .model import VisionTransformer, VALID_MODELS`
`3`	`3`	`from .utils import (`
`4`	`4`	`Params,`