From 5f243342999fe9f968949b6ca4bf2d3eb61db733 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Sun, 20 Dec 2020 12:11:13 +0800 Subject: [PATCH 01/10] modify model to EfficientNetAutoEncoder --- efficientnet_pytorch/model.py | 110 ++++++++++++++++++++++++++++------ efficientnet_pytorch/utils.py | 71 ++++++++++++++++++++-- 2 files changed, 156 insertions(+), 25 deletions(-) mode change 100755 => 100644 efficientnet_pytorch/model.py mode change 100755 => 100644 efficientnet_pytorch/utils.py diff --git a/efficientnet_pytorch/model.py b/efficientnet_pytorch/model.py old mode 100755 new mode 100644 index e89e97b..8c776e4 --- a/efficientnet_pytorch/model.py +++ b/efficientnet_pytorch/model.py @@ -40,6 +40,7 @@ class MBConvBlock(nn.Module): block_args (namedtuple): BlockArgs, defined in utils.py. global_params (namedtuple): GlobalParam, defined in utils.py. image_size (tuple or list): [image_height, image_width]. + decoder_mode (bool): Reverse the block (deconvolution) if true. References: [1] https://arxiv.org/abs/1704.04861 (MobileNet v1) @@ -47,19 +48,20 @@ class MBConvBlock(nn.Module): [3] https://arxiv.org/abs/1905.02244 (MobileNet v3) """ - def __init__(self, block_args, global_params, image_size=None): + def __init__(self, block_args, global_params, image_size=None, decoder_mode=False): super().__init__() self._block_args = block_args self._bn_mom = 1 - global_params.batch_norm_momentum # pytorch's difference from tensorflow self._bn_eps = global_params.batch_norm_epsilon self.has_se = (self._block_args.se_ratio is not None) and (0 < self._block_args.se_ratio <= 1) self.id_skip = block_args.id_skip # whether to use skip connection and drop connect + self.decoder_mode = decoder_mode # Expansion phase (Inverted Bottleneck) inp = self._block_args.input_filters # number of input channels oup = self._block_args.input_filters * self._block_args.expand_ratio # number of output channels if self._block_args.expand_ratio != 1: - Conv2d = get_same_padding_conv2d(image_size=image_size) + Conv2d = get_same_padding_conv2d(image_size=image_size, transposed=self.decoder_mode) self._expand_conv = Conv2d(in_channels=inp, out_channels=oup, kernel_size=1, bias=False) self._bn0 = nn.BatchNorm2d(num_features=oup, momentum=self._bn_mom, eps=self._bn_eps) # image_size = calculate_output_image_size(image_size, 1) <-- this wouldn't modify image_size @@ -67,23 +69,23 @@ def __init__(self, block_args, global_params, image_size=None): # Depthwise convolution phase k = self._block_args.kernel_size s = self._block_args.stride - Conv2d = get_same_padding_conv2d(image_size=image_size) + Conv2d = get_same_padding_conv2d(image_size=image_size, transposed=self.decoder_mode) self._depthwise_conv = Conv2d( in_channels=oup, out_channels=oup, groups=oup, # groups makes it depthwise kernel_size=k, stride=s, bias=False) self._bn1 = nn.BatchNorm2d(num_features=oup, momentum=self._bn_mom, eps=self._bn_eps) - image_size = calculate_output_image_size(image_size, s) + image_size = calculate_output_image_size(image_size, s, transposed=self.decoder_mode) # Squeeze and Excitation layer, if desired if self.has_se: - Conv2d = get_same_padding_conv2d(image_size=(1, 1)) + Conv2d = get_same_padding_conv2d(image_size=(1, 1), transposed=self.decoder_mode) num_squeezed_channels = max(1, int(self._block_args.input_filters * self._block_args.se_ratio)) self._se_reduce = Conv2d(in_channels=oup, out_channels=num_squeezed_channels, kernel_size=1) self._se_expand = Conv2d(in_channels=num_squeezed_channels, out_channels=oup, kernel_size=1) # Pointwise 
convolution phase final_oup = self._block_args.output_filters - Conv2d = get_same_padding_conv2d(image_size=image_size) + Conv2d = get_same_padding_conv2d(image_size=image_size, transposed=self.decoder_mode) self._project_conv = Conv2d(in_channels=oup, out_channels=final_oup, kernel_size=1, bias=False) self._bn2 = nn.BatchNorm2d(num_features=final_oup, momentum=self._bn_mom, eps=self._bn_eps) self._swish = MemoryEfficientSwish() @@ -140,8 +142,8 @@ def set_swish(self, memory_efficient=True): self._swish = MemoryEfficientSwish() if memory_efficient else Swish() -class EfficientNet(nn.Module): - """EfficientNet model. +class EfficientNetAutoEncoder(nn.Module): + """EfficientNet AutoEncoder model. Most easily loaded with the .from_name or .from_pretrained methods. Args: @@ -173,10 +175,10 @@ def __init__(self, blocks_args=None, global_params=None): bn_mom = 1 - self._global_params.batch_norm_momentum bn_eps = self._global_params.batch_norm_epsilon + # ==== EfficientNet Encoder ==== # Get stem static or dynamic convolution depending on image size image_size = global_params.image_size Conv2d = get_same_padding_conv2d(image_size=image_size) - # Stem in_channels = 3 # rgb out_channels = round_filters(32, self._global_params) # number of output channels @@ -211,12 +213,51 @@ def __init__(self, blocks_args=None, global_params=None): self._conv_head = Conv2d(in_channels, out_channels, kernel_size=1, bias=False) self._bn1 = nn.BatchNorm2d(num_features=out_channels, momentum=bn_mom, eps=bn_eps) - # Final linear layer + # ==== Linear layer for latent space ==== self._avg_pooling = nn.AdaptiveAvgPool2d(1) self._dropout = nn.Dropout(self._global_params.dropout_rate) self._fc = nn.Linear(out_channels, self._global_params.num_classes) self._swish = MemoryEfficientSwish() + # ==== EfficientNet Decoder ==== + # use dynamic image size for decoder + TransposedConv2d = get_same_padding_conv2d(image_size=image_size, transposed=True) + + # Stem + # number of output channels from encoder model + in_channels, out_channels = out_channels, in_channels + # self._decoder_conv_stem symmetry to self._conv_head + self._decoder_conv_stem = TransposedConv2d(in_channels, out_channels, kernel_size=1, bias=False) + image_size = calculate_output_image_size(image_size, 1, transposed=True) + self._decoder_bn0 = nn.BatchNorm2d(num_features=out_channels, momentum=bn_mom, eps=bn_eps) + # image_size = calculate_output_image_size(image_size, 2) + + # Build blocks + self._decoder_blocks = nn.ModuleList([]) + for block_args in reversed(self._blocks_args): + + # Update block input and output filters based on depth multiplier. + # NOTE: input/output are flip here to support deconvolution + block_args = block_args._replace( + input_filters=round_filters(block_args.output_filters, self._global_params), + output_filters=round_filters(block_args.input_filters, self._global_params), + num_repeat=round_repeats(block_args.num_repeat, self._global_params) + ) + # The first block needs to take care of stride and filter size increase. 
+ self._decoder_blocks.append(MBConvBlock(block_args, self._global_params, image_size=image_size, decoder_mode=True)) + image_size = calculate_output_image_size(image_size, block_args.stride, transposed=True) + if block_args.num_repeat > 1: # modify block_args to keep same output size + block_args = block_args._replace(input_filters=block_args.output_filters, stride=1) + for _ in range(block_args.num_repeat - 1): + self._decoder_blocks.append(MBConvBlock(block_args, self._global_params, image_size=image_size, decoder_mode=True)) + # image_size = calculate_output_image_size(image_size, block_args.stride) # stride = 1 + + # Head + in_channels = round_filters(32, self._global_params) # number of output channels + out_channels = 3 # rgb + self._decoder_conv_head = TransposedConv2d(in_channels, out_channels, kernel_size=3, stride=2, bias=False) + self._decoder_bn1 = nn.BatchNorm2d(num_features=out_channels, momentum=bn_mom, eps=bn_eps) + def set_swish(self, memory_efficient=True): """Sets swish function as memory efficient (for training) or standard (for export). @@ -272,6 +313,33 @@ def extract_endpoints(self, inputs): return endpoints + def decode_features(self, inputs): + """decoder portion of this autoencoder. + + Args: + inputs (tensor): Input tensor to the decoder, + usually from self.extract_features + + Returns: + Output of the final convolution + layer in the efficientnet model. + """ + # Stem + x = self._swish(self._decoder_bn0(self._decoder_conv_stem(inputs))) + # Blocks + for idx, block in enumerate(self._decoder_blocks): + drop_connect_rate = self._global_params.drop_connect_rate + if drop_connect_rate: + # scale drop connect_rate + drop_connect_rate *= float(idx) / len(self._blocks) + x = block(x, drop_connect_rate=drop_connect_rate) + + # Head + x = self._swish(self._decoder_bn1(self._decoder_conv_head(x))) + + return x + + def extract_features(self, inputs): """use convolution layer to extract feature . @@ -298,24 +366,28 @@ def extract_features(self, inputs): return x def forward(self, inputs): - """EfficientNet's forward function. - Calls extract_features to extract features, applies final linear layer, and returns logits. + """EfficientNet AutoEncoder's forward function. + Calls extract_features to extract features, + then calls decode features to generates original inputs. Args: inputs (tensor): Input tensor. Returns: - Output of this model after processing. + (AE output tensor, latent representation tensor) """ # Convolution layers x = self.extract_features(inputs) + # Pooling and final linear layer - x = self._avg_pooling(x) - if self._global_params.include_top: - x = x.flatten(start_dim=1) - x = self._dropout(x) - x = self._fc(x) - return x + latent_rep = self._avg_pooling(x) + latent_rep = latent_rep.flatten(start_dim=1) + latent_rep = self._dropout(latent_rep) + latent_rep = self._fc(latent_rep) + + # Deconvolution - decoder + x = self.decode_features(x) + return x, latent_rep @classmethod def from_name(cls, model_name, in_channels=3, **override_params): diff --git a/efficientnet_pytorch/utils.py b/efficientnet_pytorch/utils.py old mode 100755 new mode 100644 index 6a84345..2a95c9a --- a/efficientnet_pytorch/utils.py +++ b/efficientnet_pytorch/utils.py @@ -167,7 +167,7 @@ def get_width_and_height_from_size(x): raise TypeError() -def calculate_output_image_size(input_image_size, stride): +def calculate_output_image_size(input_image_size, stride, transposed=False): """Calculates the output image size when using Conv2dSamePadding with a stride. 
Necessary for static padding. Thanks to mannatsingh for pointing this out. @@ -182,8 +182,12 @@ def calculate_output_image_size(input_image_size, stride): return None image_height, image_width = get_width_and_height_from_size(input_image_size) stride = stride if isinstance(stride, int) else stride[0] - image_height = int(math.ceil(image_height / stride)) - image_width = int(math.ceil(image_width / stride)) + if transposed: + image_height = int(image_height * stride) + image_width = int(image_width * stride) + else: + image_height = int(math.ceil(image_height / stride)) + image_width = int(math.ceil(image_width / stride)) return [image_height, image_width] @@ -192,16 +196,20 @@ def calculate_output_image_size(input_image_size, stride): # Only when stride equals 1, can the output size be the same as input size. # Don't be confused by their function names ! ! ! -def get_same_padding_conv2d(image_size=None): +def get_same_padding_conv2d(image_size=None, transposed=False): """Chooses static padding if you have specified an image size, and dynamic padding otherwise. Static padding is necessary for ONNX exporting of models. Args: image_size (int or tuple): Size of the image. + transposed (bool): use nn.functional.conv_transpose2d if true, and nn.functional.conv2d otherwise. Returns: Conv2dDynamicSamePadding or Conv2dStaticSamePadding. """ + if transposed: + return TransposedConv2dDynamicSamePadding + if image_size is None: return Conv2dDynamicSamePadding else: @@ -271,6 +279,47 @@ def forward(self, x): x = F.conv2d(x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups) return x +class TransposedConv2dDynamicSamePadding(nn.ConvTranspose2d): + """2D Convolutions like TensorFlow, for a dynamic image size. + The padding is operated in forward function by calculating dynamically. + """ + + # Tips for 'SAME' mode padding. + # Given the following: + # i: width or height + # s: stride + # k: kernel size + # d: dilation + # p: padding + # op: output padding + # Output after ConvTranspose2d: + # (i-1)*s + (k-1)*d + op + 1 + + def __init__(self, in_channels, out_channels, kernel_size, stride=1, output_padding=0, groups=1, bias=True, dilation=1): + super().__init__(in_channels, out_channels, kernel_size, stride, 0, output_padding, groups, bias, dilation) + self.stride = self.stride if len(self.stride) == 2 else [self.stride[0]] * 2 + self.output_padding = output_padding + + def forward(self, x): + ih, iw = x.size()[-2:] + kh, kw = self.weight.size()[-2:] + sh, sw = self.stride + oh, ow = ih * sh, iw * sw # change the output size according to stride ! ! ! + # actual height/width after TransposedConv2d + actual_oh = (ih - 1) * self.stride[0] + (kh - 1) * self.dilation[0] + self.output_padding + 1 + actual_ow = (iw - 1) * self.stride[1] + (kw - 1) * self.dilation[1] + self.output_padding + 1 + crop_h = actual_oh - oh + crop_w = actual_ow - ow + assert crop_h >= 0 and crop_w >= 0 + + x = F.conv_transpose2d(x, self.weight, self.bias, self.stride, self.padding, + self.output_padding, self.groups, self.dilation) + assert x.size()[-2:] == (actual_oh, actual_ow) + if crop_h > 0 or crop_w > 0: + x = x[:, :, crop_h // 2 : - (crop_h - crop_h // 2), crop_w // 2 : - (crop_w - crop_w // 2)] + + assert x.size()[-2:] == (oh, ow) + return x def get_same_padding_maxPool2d(image_size=None): """Chooses static padding if you have specified an image size, and dynamic padding otherwise. 
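To make the 'SAME' cropping logic above concrete, here is a minimal standalone sketch (not part of the patch; the sizes i=7, k=3, s=2 are chosen for illustration, matching the decoder head) that reproduces the same arithmetic with plain torch.nn.functional:

import torch
import torch.nn.functional as F

# 'SAME' deconvolution targets i*s; a raw ConvTranspose2d instead yields
# (i-1)*s + (k-1)*d + op + 1, so the surplus rows/columns are cropped away.
i, k, s, d, op = 7, 3, 2, 1, 0                       # illustrative sizes only
actual = (i - 1) * s + (k - 1) * d + op + 1          # 15
target = i * s                                       # 14
crop = actual - target                               # 1

x = torch.randn(1, 1, i, i)
w = torch.randn(1, 1, k, k)                          # (in_channels, out_channels, kH, kW)
y = F.conv_transpose2d(x, w, stride=s)
assert y.size()[-2:] == (actual, actual)
if crop > 0:                                         # same centered crop as forward() above
    y = y[:, :, crop // 2 : -(crop - crop // 2), crop // 2 : -(crop - crop // 2)]
assert y.size()[-2:] == (target, target)

The guard on a positive crop mirrors the `if crop_h > 0 or crop_w > 0` check in the forward above; without it, a zero crop would produce an empty slice.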
@@ -598,13 +647,23 @@ def load_pretrained_weights(model, model_name, weights_path=None, load_fc=True, if load_fc: ret = model.load_state_dict(state_dict, strict=False) - assert not ret.missing_keys, 'Missing keys when loading pretrained weights: {}'.format(ret.missing_keys) + + # weights for decoder are not loaded + # TODO: add initialization to missing layers + missing_keys = [] + for key in ret.missing_keys: + if not key.startswith('_decoder'): + missing_keys.append(key) + + assert not missing_keys, 'Missing keys when loading pretrained weights: {}'.format( + missing_keys) else: state_dict.pop('_fc.weight') state_dict.pop('_fc.bias') ret = model.load_state_dict(state_dict, strict=False) assert set(ret.missing_keys) == set( ['_fc.weight', '_fc.bias']), 'Missing keys when loading pretrained weights: {}'.format(ret.missing_keys) - assert not ret.unexpected_keys, 'Missing keys when loading pretrained weights: {}'.format(ret.unexpected_keys) + assert not ret.unexpected_keys, 'Missing keys when loading pretrained weights: {}'.format( + ret.unexpected_keys) print('Loaded pretrained weights for {}'.format(model_name)) From ff4dec4aafb36327cd6956db93c127c6b99e5683 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Sun, 20 Dec 2020 12:14:12 +0800 Subject: [PATCH 02/10] add comments --- efficientnet_pytorch/model.py | 4 ++-- efficientnet_pytorch/utils.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/efficientnet_pytorch/model.py b/efficientnet_pytorch/model.py index 8c776e4..e5f50ad 100644 --- a/efficientnet_pytorch/model.py +++ b/efficientnet_pytorch/model.py @@ -159,9 +159,9 @@ class EfficientNetAutoEncoder(nn.Module): import torch >>> from efficientnet.model import EfficientNet >>> inputs = torch.rand(1, 3, 224, 224) - >>> model = EfficientNet.from_pretrained('efficientnet-b0') + >>> model = EfficientNetAutoEncoder.from_pretrained('efficientnet-b0') >>> model.eval() - >>> outputs = model(inputs) + >>> ae_output, latent_fc_output = model(inputs) """ def __init__(self, blocks_args=None, global_params=None): diff --git a/efficientnet_pytorch/utils.py b/efficientnet_pytorch/utils.py index 2a95c9a..7871a87 100644 --- a/efficientnet_pytorch/utils.py +++ b/efficientnet_pytorch/utils.py @@ -654,6 +654,7 @@ def load_pretrained_weights(model, model_name, weights_path=None, load_fc=True, for key in ret.missing_keys: if not key.startswith('_decoder'): missing_keys.append(key) + print(f"Weights for _decoder keys are not loaded.") assert not missing_keys, 'Missing keys when loading pretrained weights: {}'.format( missing_keys) From 4d64d871d550b35dc4eb508a174b6a114722b9b1 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Sun, 20 Dec 2020 12:22:51 +0800 Subject: [PATCH 03/10] make compatible with original EfficientNet --- efficientnet_pytorch/model.py | 220 ++++++++++++++++++++-------------- efficientnet_pytorch/utils.py | 1 - 2 files changed, 132 insertions(+), 89 deletions(-) diff --git a/efficientnet_pytorch/model.py b/efficientnet_pytorch/model.py index e5f50ad..b9ea59c 100644 --- a/efficientnet_pytorch/model.py +++ b/efficientnet_pytorch/model.py @@ -142,8 +142,8 @@ def set_swish(self, memory_efficient=True): self._swish = MemoryEfficientSwish() if memory_efficient else Swish() -class EfficientNetAutoEncoder(nn.Module): - """EfficientNet AutoEncoder model. +class EfficientNet(nn.Module): + """EfficientNet model. Most easily loaded with the .from_name or .from_pretrained methods. 
Args: @@ -154,14 +154,12 @@ class EfficientNetAutoEncoder(nn.Module): [1] https://arxiv.org/abs/1905.11946 (EfficientNet) Example: - - - import torch + >>> import torch >>> from efficientnet.model import EfficientNet >>> inputs = torch.rand(1, 3, 224, 224) - >>> model = EfficientNetAutoEncoder.from_pretrained('efficientnet-b0') + >>> model = EfficientNet.from_pretrained('efficientnet-b0') >>> model.eval() - >>> ae_output, latent_fc_output = model(inputs) + >>> outputs = model(inputs) """ def __init__(self, blocks_args=None, global_params=None): @@ -175,10 +173,10 @@ def __init__(self, blocks_args=None, global_params=None): bn_mom = 1 - self._global_params.batch_norm_momentum bn_eps = self._global_params.batch_norm_epsilon - # ==== EfficientNet Encoder ==== # Get stem static or dynamic convolution depending on image size image_size = global_params.image_size Conv2d = get_same_padding_conv2d(image_size=image_size) + # Stem in_channels = 3 # rgb out_channels = round_filters(32, self._global_params) # number of output channels @@ -213,51 +211,12 @@ def __init__(self, blocks_args=None, global_params=None): self._conv_head = Conv2d(in_channels, out_channels, kernel_size=1, bias=False) self._bn1 = nn.BatchNorm2d(num_features=out_channels, momentum=bn_mom, eps=bn_eps) - # ==== Linear layer for latent space ==== + # Final linear layer self._avg_pooling = nn.AdaptiveAvgPool2d(1) self._dropout = nn.Dropout(self._global_params.dropout_rate) self._fc = nn.Linear(out_channels, self._global_params.num_classes) self._swish = MemoryEfficientSwish() - # ==== EfficientNet Decoder ==== - # use dynamic image size for decoder - TransposedConv2d = get_same_padding_conv2d(image_size=image_size, transposed=True) - - # Stem - # number of output channels from encoder model - in_channels, out_channels = out_channels, in_channels - # self._decoder_conv_stem symmetry to self._conv_head - self._decoder_conv_stem = TransposedConv2d(in_channels, out_channels, kernel_size=1, bias=False) - image_size = calculate_output_image_size(image_size, 1, transposed=True) - self._decoder_bn0 = nn.BatchNorm2d(num_features=out_channels, momentum=bn_mom, eps=bn_eps) - # image_size = calculate_output_image_size(image_size, 2) - - # Build blocks - self._decoder_blocks = nn.ModuleList([]) - for block_args in reversed(self._blocks_args): - - # Update block input and output filters based on depth multiplier. - # NOTE: input/output are flip here to support deconvolution - block_args = block_args._replace( - input_filters=round_filters(block_args.output_filters, self._global_params), - output_filters=round_filters(block_args.input_filters, self._global_params), - num_repeat=round_repeats(block_args.num_repeat, self._global_params) - ) - # The first block needs to take care of stride and filter size increase. 
- self._decoder_blocks.append(MBConvBlock(block_args, self._global_params, image_size=image_size, decoder_mode=True)) - image_size = calculate_output_image_size(image_size, block_args.stride, transposed=True) - if block_args.num_repeat > 1: # modify block_args to keep same output size - block_args = block_args._replace(input_filters=block_args.output_filters, stride=1) - for _ in range(block_args.num_repeat - 1): - self._decoder_blocks.append(MBConvBlock(block_args, self._global_params, image_size=image_size, decoder_mode=True)) - # image_size = calculate_output_image_size(image_size, block_args.stride) # stride = 1 - - # Head - in_channels = round_filters(32, self._global_params) # number of output channels - out_channels = 3 # rgb - self._decoder_conv_head = TransposedConv2d(in_channels, out_channels, kernel_size=3, stride=2, bias=False) - self._decoder_bn1 = nn.BatchNorm2d(num_features=out_channels, momentum=bn_mom, eps=bn_eps) - def set_swish(self, memory_efficient=True): """Sets swish function as memory efficient (for training) or standard (for export). @@ -313,33 +272,6 @@ def extract_endpoints(self, inputs): return endpoints - def decode_features(self, inputs): - """decoder portion of this autoencoder. - - Args: - inputs (tensor): Input tensor to the decoder, - usually from self.extract_features - - Returns: - Output of the final convolution - layer in the efficientnet model. - """ - # Stem - x = self._swish(self._decoder_bn0(self._decoder_conv_stem(inputs))) - # Blocks - for idx, block in enumerate(self._decoder_blocks): - drop_connect_rate = self._global_params.drop_connect_rate - if drop_connect_rate: - # scale drop connect_rate - drop_connect_rate *= float(idx) / len(self._blocks) - x = block(x, drop_connect_rate=drop_connect_rate) - - # Head - x = self._swish(self._decoder_bn1(self._decoder_conv_head(x))) - - return x - - def extract_features(self, inputs): """use convolution layer to extract feature . @@ -366,28 +298,25 @@ def extract_features(self, inputs): return x def forward(self, inputs): - """EfficientNet AutoEncoder's forward function. - Calls extract_features to extract features, - then calls decode features to generates original inputs. + """EfficientNet's forward function. + Calls extract_features to extract features, applies final linear layer, and returns logits. Args: inputs (tensor): Input tensor. Returns: - (AE output tensor, latent representation tensor) + Output of this model after processing. """ # Convolution layers x = self.extract_features(inputs) - + # Pooling and final linear layer - latent_rep = self._avg_pooling(x) - latent_rep = latent_rep.flatten(start_dim=1) - latent_rep = self._dropout(latent_rep) - latent_rep = self._fc(latent_rep) - - # Deconvolution - decoder - x = self.decode_features(x) - return x, latent_rep + x = self._avg_pooling(x) + x = x.flatten(start_dim=1) + x = self._dropout(x) + x = self._fc(x) + + return x @classmethod def from_name(cls, model_name, in_channels=3, **override_params): @@ -485,3 +414,118 @@ def _change_in_channels(self, in_channels): Conv2d = get_same_padding_conv2d(image_size=self._global_params.image_size) out_channels = round_filters(32, self._global_params) self._conv_stem = Conv2d(in_channels, out_channels, kernel_size=3, stride=2, bias=False) + +class EfficientNetAutoEncoder(EfficientNet): + """EfficientNet AutoEncoder model. + Most easily loaded with the .from_name or .from_pretrained methods. + + Args: + blocks_args (list[namedtuple]): A list of BlockArgs to construct blocks. 
+ global_params (namedtuple): A set of GlobalParams shared between blocks. + + References: + [1] https://arxiv.org/abs/1905.11946 (EfficientNet) + + Example: + + + import torch + >>> from efficientnet.model import EfficientNet + >>> inputs = torch.rand(1, 3, 224, 224) + >>> model = EfficientNetAutoEncoder.from_pretrained('efficientnet-b0') + >>> model.eval() + >>> ae_output, latent_fc_output = model(inputs) + """ + + def __init__(self, blocks_args=None, global_params=None): + super().__init__(blocks_args=blocks_args, global_params=global_params) + + # EfficientNet Decoder + # use dynamic image size for decoder + TransposedConv2d = get_same_padding_conv2d(image_size=image_size, transposed=True) + + # Stem + # number of output channels from encoder model + in_channels, out_channels = out_channels, in_channels + # self._decoder_conv_stem symmetry to self._conv_head + self._decoder_conv_stem = TransposedConv2d(in_channels, out_channels, kernel_size=1, bias=False) + image_size = calculate_output_image_size(image_size, 1, transposed=True) + self._decoder_bn0 = nn.BatchNorm2d(num_features=out_channels, momentum=bn_mom, eps=bn_eps) + # image_size = calculate_output_image_size(image_size, 2) + + # Build blocks + self._decoder_blocks = nn.ModuleList([]) + for block_args in reversed(self._blocks_args): + + # Update block input and output filters based on depth multiplier. + # NOTE: input/output are flip here to support deconvolution + block_args = block_args._replace( + input_filters=round_filters(block_args.output_filters, self._global_params), + output_filters=round_filters(block_args.input_filters, self._global_params), + num_repeat=round_repeats(block_args.num_repeat, self._global_params) + ) + # The first block needs to take care of stride and filter size increase. + self._decoder_blocks.append(MBConvBlock(block_args, self._global_params, image_size=image_size, decoder_mode=True)) + image_size = calculate_output_image_size(image_size, block_args.stride, transposed=True) + if block_args.num_repeat > 1: # modify block_args to keep same output size + block_args = block_args._replace(input_filters=block_args.output_filters, stride=1) + for _ in range(block_args.num_repeat - 1): + self._decoder_blocks.append(MBConvBlock(block_args, self._global_params, image_size=image_size, decoder_mode=True)) + # image_size = calculate_output_image_size(image_size, block_args.stride) # stride = 1 + + # Head + in_channels = round_filters(32, self._global_params) # number of output channels + out_channels = 3 # rgb + self._decoder_conv_head = TransposedConv2d(in_channels, out_channels, kernel_size=3, stride=2, bias=False) + self._decoder_bn1 = nn.BatchNorm2d(num_features=out_channels, momentum=bn_mom, eps=bn_eps) + + def decode_features(self, inputs): + """decoder portion of this autoencoder. + + Args: + inputs (tensor): Input tensor to the decoder, + usually from self.extract_features + + Returns: + Output of the final convolution + layer in the efficientnet model. + """ + # Stem + x = self._swish(self._decoder_bn0(self._decoder_conv_stem(inputs))) + # Blocks + for idx, block in enumerate(self._decoder_blocks): + drop_connect_rate = self._global_params.drop_connect_rate + if drop_connect_rate: + # scale drop connect_rate + drop_connect_rate *= float(idx) / len(self._blocks) + x = block(x, drop_connect_rate=drop_connect_rate) + + # Head + x = self._swish(self._decoder_bn1(self._decoder_conv_head(x))) + + return x + + + def forward(self, inputs): + """EfficientNet AutoEncoder's forward function. 
+ Calls extract_features to extract features, + then calls decode features to generates original inputs. + + Args: + inputs (tensor): Input tensor. + + Returns: + (AE output tensor, latent representation tensor) + """ + # Convolution layers + x = self.extract_features(inputs) + + # Pooling and final linear layer + latent_rep = self._avg_pooling(x) + latent_rep = latent_rep.flatten(start_dim=1) + latent_rep = self._dropout(latent_rep) + latent_rep = self._fc(latent_rep) + + # Deconvolution - decoder + x = self.decode_features(x) + return x, latent_rep diff --git a/efficientnet_pytorch/utils.py b/efficientnet_pytorch/utils.py index 7871a87..2a95c9a 100644 --- a/efficientnet_pytorch/utils.py +++ b/efficientnet_pytorch/utils.py @@ -654,7 +654,6 @@ def load_pretrained_weights(model, model_name, weights_path=None, load_fc=True, for key in ret.missing_keys: if not key.startswith('_decoder'): missing_keys.append(key) - print(f"Weights for _decoder keys are not loaded.") assert not missing_keys, 'Missing keys when loading pretrained weights: {}'.format( missing_keys) From b975d38de9a189964619c3cddbdf1f5ba1f79165 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Sun, 20 Dec 2020 13:09:29 +0800 Subject: [PATCH 04/10] revert forward func of EfficientNet --- efficientnet_pytorch/model.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/efficientnet_pytorch/model.py b/efficientnet_pytorch/model.py index b9ea59c..79e7da7 100644 --- a/efficientnet_pytorch/model.py +++ b/efficientnet_pytorch/model.py @@ -312,10 +312,10 @@ def forward(self, inputs): # Pooling and final linear layer x = self._avg_pooling(x) - x = x.flatten(start_dim=1) - x = self._dropout(x) - x = self._fc(x) - + if self._global_params.include_top: + x = x.flatten(start_dim=1) + x = self._dropout(x) + x = self._fc(x) return x @classmethod From e8648d83bfb1dfd0d5e54545ab5f5841f9a77816 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Sun, 20 Dec 2020 13:11:22 +0800 Subject: [PATCH 05/10] fix indentation of EfficientNet --- efficientnet_pytorch/model.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/efficientnet_pytorch/model.py b/efficientnet_pytorch/model.py index 79e7da7..a8db4ef 100644 --- a/efficientnet_pytorch/model.py +++ b/efficientnet_pytorch/model.py @@ -239,7 +239,9 @@ def extract_endpoints(self, inputs): Dictionary of last intermediate features with reduction levels i in [1, 2, 3, 4, 5]. 
Example: - >>> import torch + + + import torch >>> from efficientnet.model import EfficientNet >>> inputs = torch.rand(1, 3, 224, 224) >>> model = EfficientNet.from_pretrained('efficientnet-b0') From b34fb73d67739a61b5eff76c54b33da3531a30f7 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Sun, 20 Dec 2020 23:03:30 +0800 Subject: [PATCH 06/10] fixed variable --- efficientnet_pytorch/model.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/efficientnet_pytorch/model.py b/efficientnet_pytorch/model.py index a8db4ef..c99c5c9 100644 --- a/efficientnet_pytorch/model.py +++ b/efficientnet_pytorch/model.py @@ -170,8 +170,8 @@ def __init__(self, blocks_args=None, global_params=None): self._blocks_args = blocks_args # Batch norm parameters - bn_mom = 1 - self._global_params.batch_norm_momentum - bn_eps = self._global_params.batch_norm_epsilon + self._bn_mom = bn_mom = 1 - self._global_params.batch_norm_momentum + self._bn_eps = bn_eps = self._global_params.batch_norm_epsilon # Get stem static or dynamic convolution depending on image size image_size = global_params.image_size @@ -217,6 +217,8 @@ def __init__(self, blocks_args=None, global_params=None): self._fc = nn.Linear(out_channels, self._global_params.num_classes) self._swish = MemoryEfficientSwish() + self._image_size = image_size + def set_swish(self, memory_efficient=True): """Sets swish function as memory efficient (for training) or standard (for export). @@ -241,7 +243,7 @@ def extract_endpoints(self, inputs): Example: - import torch + >>> import torch >>> from efficientnet.model import EfficientNet >>> inputs = torch.rand(1, 3, 224, 224) >>> model = EfficientNet.from_pretrained('efficientnet-b0') @@ -431,7 +433,7 @@ class EfficientNetAutoEncoder(EfficientNet): Example: - import torch + >>> import torch >>> from efficientnet.model import EfficientNet >>> inputs = torch.rand(1, 3, 224, 224) >>> model = EfficientNetAutoEncoder.from_pretrained('efficientnet-b0') @@ -441,15 +443,18 @@ class EfficientNetAutoEncoder(EfficientNet): def __init__(self, blocks_args=None, global_params=None): super().__init__(blocks_args=blocks_args, global_params=global_params) - + bn_mom = self._bn_mon + bn_eps = self._bn_eps + image_size = self._image_size + # EfficientNet Decoder # use dynamic image size for decoder TransposedConv2d = get_same_padding_conv2d(image_size=image_size, transposed=True) # Stem - # number of output channels from encoder model - in_channels, out_channels = out_channels, in_channels # self._decoder_conv_stem symmetry to self._conv_head + in_channels = round_filters(1280, self._global_params) + out_channels = block_args.output_filters # output of final block self._decoder_conv_stem = TransposedConv2d(in_channels, out_channels, kernel_size=1, bias=False) image_size = calculate_output_image_size(image_size, 1, transposed=True) self._decoder_bn0 = nn.BatchNorm2d(num_features=out_channels, momentum=bn_mom, eps=bn_eps) @@ -460,7 +465,7 @@ def __init__(self, blocks_args=None, global_params=None): for block_args in reversed(self._blocks_args): # Update block input and output filters based on depth multiplier. 
- # NOTE: input/output are flip here to support deconvolution + # input/output are flip here to support deconvolution block_args = block_args._replace( input_filters=round_filters(block_args.output_filters, self._global_params), output_filters=round_filters(block_args.input_filters, self._global_params), From 52067bd550c12faec6b4049fa79cf52c4dd79334 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Sun, 20 Dec 2020 23:11:38 +0800 Subject: [PATCH 07/10] fixed variable --- efficientnet_pytorch/model.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/efficientnet_pytorch/model.py b/efficientnet_pytorch/model.py index c99c5c9..3bca7b7 100644 --- a/efficientnet_pytorch/model.py +++ b/efficientnet_pytorch/model.py @@ -218,6 +218,7 @@ def __init__(self, blocks_args=None, global_params=None): self._swish = MemoryEfficientSwish() self._image_size = image_size + self._last_block_args = block_args def set_swish(self, memory_efficient=True): """Sets swish function as memory efficient (for training) or standard (for export). @@ -443,9 +444,10 @@ class EfficientNetAutoEncoder(EfficientNet): def __init__(self, blocks_args=None, global_params=None): super().__init__(blocks_args=blocks_args, global_params=global_params) - bn_mom = self._bn_mon + bn_mom = self._bn_mom bn_eps = self._bn_eps image_size = self._image_size + block_args = self._last_block_args # EfficientNet Decoder # use dynamic image size for decoder From f974bad36d8529eebcdb5af01263b080d29cdd54 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Sun, 20 Dec 2020 23:17:15 +0800 Subject: [PATCH 08/10] modify comments --- efficientnet_pytorch/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/efficientnet_pytorch/model.py b/efficientnet_pytorch/model.py index 3bca7b7..870dfd6 100644 --- a/efficientnet_pytorch/model.py +++ b/efficientnet_pytorch/model.py @@ -518,7 +518,7 @@ def decode_features(self, inputs): def forward(self, inputs): """EfficientNet AutoEncoder's forward function. Calls extract_features to extract features, - then calls decode features to generates original inputs. + then calls decode features to calculate AE output. Args: inputs (tensor): Input tensor. 
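The size bookkeeping that patches 01-08 rely on can be checked in isolation: the encoder ceil-divides each spatial size by the stride, while the decoder multiplies by it, so the round trip is only exact when every intermediate size is even. A small sketch (a plain re-implementation of calculate_output_image_size from patch 01, for illustration; the stride list assumes efficientnet-b0's stem plus its seven block groups) shows both the clean round trip and the odd-size failure that the next patch addresses:

import math

def calculate_output_image_size(size, stride, transposed=False):
    # mirrors the helper from patch 01 (int strides only, for brevity)
    if transposed:
        return [size[0] * stride, size[1] * stride]
    return [int(math.ceil(size[0] / stride)), int(math.ceil(size[1] / stride))]

encoder_strides = [2, 1, 2, 2, 2, 1, 2, 1]   # efficientnet-b0: stem, then 7 block groups

size = [224, 224]
for s in encoder_strides:
    size = calculate_output_image_size(size, s)
print(size)                                   # [7, 7] at the encoder head
for s in reversed(encoder_strides):
    size = calculate_output_image_size(size, s, transposed=True)
print(size)                                   # [224, 224] -- round trip is exact here

size = calculate_output_image_size([15, 15], 2)               # ceil(15/2) = 8
size = calculate_output_image_size(size, 2, transposed=True)  # 8*2 = 16 != 15
print(size)                                   # the odd-size mismatch patch 09 fixes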
From 16e0633386c6190af2884b0125c35a4fb306256b Mon Sep 17 00:00:00 2001
From: Xingyao Wang
Date: Mon, 28 Dec 2020 23:41:45 +0800
Subject: [PATCH 09/10] add TransposedConv2dStaticSamePadding to fix the
 TransposedConv2d odd-image-size issue; add a latent feature via
 down/upsampling between encoder and decoder

---
 efficientnet_pytorch/model.py | 81 +++++++++++++++++++++++++++--------
 efficientnet_pytorch/utils.py | 38 ++++++++++------
 2 files changed, 88 insertions(+), 31 deletions(-)

diff --git a/efficientnet_pytorch/model.py b/efficientnet_pytorch/model.py
index 870dfd6..2dfe476 100644
--- a/efficientnet_pytorch/model.py
+++ b/efficientnet_pytorch/model.py
@@ -48,7 +48,7 @@ class MBConvBlock(nn.Module):
         [3] https://arxiv.org/abs/1905.02244 (MobileNet v3)
     """

-    def __init__(self, block_args, global_params, image_size=None, decoder_mode=False):
+    def __init__(self, block_args, global_params, image_size=None, decoder_mode=False, decoder_output_image_size=None):
         super().__init__()
         self._block_args = block_args
         self._bn_mom = 1 - global_params.batch_norm_momentum # pytorch's difference from tensorflow
@@ -69,12 +69,16 @@ def __init__(self, block_args, global_params, image_size=None, decoder_mode=Fals
         # Depthwise convolution phase
         k = self._block_args.kernel_size
         s = self._block_args.stride
+        if self.decoder_mode:
+            # assert decoder_output_image_size
+            image_size = decoder_output_image_size
         Conv2d = get_same_padding_conv2d(image_size=image_size, transposed=self.decoder_mode)
         self._depthwise_conv = Conv2d(
             in_channels=oup, out_channels=oup, groups=oup, # groups makes it depthwise
             kernel_size=k, stride=s, bias=False)
         self._bn1 = nn.BatchNorm2d(num_features=oup, momentum=self._bn_mom, eps=self._bn_eps)
-        image_size = calculate_output_image_size(image_size, s, transposed=self.decoder_mode)
+        if not self.decoder_mode:
+            image_size = calculate_output_image_size(image_size, s)

         # Squeeze and Excitation layer, if desired
         if self.has_se:
@@ -186,6 +190,7 @@ def __init__(self, blocks_args=None, global_params=None):

         # Build blocks
         self._blocks = nn.ModuleList([])
+        self._blocks_image_size = [image_size]
         for block_args in self._blocks_args:

             # Update block input and output filters based on depth multiplier.
@@ -198,6 +203,7 @@ def __init__(self, blocks_args=None, global_params=None):
             # The first block needs to take care of stride and filter size increase.
             self._blocks.append(MBConvBlock(block_args, self._global_params, image_size=image_size))
             image_size = calculate_output_image_size(image_size, block_args.stride)
+            self._blocks_image_size.append(image_size)
             if block_args.num_repeat > 1: # modify block_args to keep same output size
                 block_args = block_args._replace(input_filters=block_args.output_filters, stride=1)
             for _ in range(block_args.num_repeat - 1):
@@ -219,6 +225,7 @@ def __init__(self, blocks_args=None, global_params=None):

         self._image_size = image_size
         self._last_block_args = block_args
+        self._last_out_channels = out_channels

     def set_swish(self, memory_efficient=True):
         """Sets swish function as memory efficient (for training) or standard (for export).
@@ -289,17 +296,18 @@ def extract_features(self, inputs): """ # Stem x = self._swish(self._bn0(self._conv_stem(inputs))) - + # print(f"after conv_stem: {x.size()}") # Blocks for idx, block in enumerate(self._blocks): drop_connect_rate = self._global_params.drop_connect_rate if drop_connect_rate: drop_connect_rate *= float(idx) / len(self._blocks) # scale drop connect_rate x = block(x, drop_connect_rate=drop_connect_rate) + # print(f"after block: {x.size()}") # Head x = self._swish(self._bn1(self._conv_head(x))) - + # print(f"after conv_head: {x.size()}") return x def forward(self, inputs): @@ -448,6 +456,13 @@ def __init__(self, blocks_args=None, global_params=None): bn_eps = self._bn_eps image_size = self._image_size block_args = self._last_block_args + + Conv2d = get_same_padding_conv2d(image_size=image_size) + self._feature_downsample = Conv2d(self._last_out_channels, 8, kernel_size=1, bias=False) + self._downsample_bn = nn.BatchNorm2d(num_features=8, momentum=bn_mom, eps=bn_eps) + self._feature_upsample = Conv2d(8, self._last_out_channels, kernel_size=1, bias=False) + self._upsample_bn = nn.BatchNorm2d(num_features=self._last_out_channels, momentum=bn_mom, eps=bn_eps) + self.feature_size = 8 * image_size[0]**2 # EfficientNet Decoder # use dynamic image size for decoder @@ -464,8 +479,12 @@ def __init__(self, blocks_args=None, global_params=None): # Build blocks self._decoder_blocks = nn.ModuleList([]) - for block_args in reversed(self._blocks_args): - + # print(f"foward size:\n{self._blocks_image_size}") + assert len(self._blocks_image_size) == len(self._blocks_args) + 1 + self._blocks_image_size = list(reversed(self._blocks_image_size)) + # print(f"backward size:\n{self._blocks_image_size}") + for i, block_args in enumerate(reversed(self._blocks_args)): + image_size = self._blocks_image_size[i] # Update block input and output filters based on depth multiplier. # input/output are flip here to support deconvolution block_args = block_args._replace( @@ -473,21 +492,42 @@ def __init__(self, blocks_args=None, global_params=None): output_filters=round_filters(block_args.input_filters, self._global_params), num_repeat=round_repeats(block_args.num_repeat, self._global_params) ) + # print(f"input filter: {block_args.input_filters}, output filter: {block_args.output_filters}") # The first block needs to take care of stride and filter size increase. 
- self._decoder_blocks.append(MBConvBlock(block_args, self._global_params, image_size=image_size, decoder_mode=True)) - image_size = calculate_output_image_size(image_size, block_args.stride, transposed=True) + self._decoder_blocks.append(MBConvBlock(block_args, self._global_params, image_size=image_size, + decoder_mode=True, decoder_output_image_size=self._blocks_image_size[i+1])) + image_size = self._blocks_image_size[i+1] if block_args.num_repeat > 1: # modify block_args to keep same output size block_args = block_args._replace(input_filters=block_args.output_filters, stride=1) for _ in range(block_args.num_repeat - 1): - self._decoder_blocks.append(MBConvBlock(block_args, self._global_params, image_size=image_size, decoder_mode=True)) + self._decoder_blocks.append(MBConvBlock(block_args, self._global_params, image_size=image_size, + decoder_mode=True, decoder_output_image_size=image_size)) # image_size = calculate_output_image_size(image_size, block_args.stride) # stride = 1 # Head in_channels = round_filters(32, self._global_params) # number of output channels out_channels = 3 # rgb + TransposedConv2d = get_same_padding_conv2d(image_size=global_params.image_size, transposed=True) self._decoder_conv_head = TransposedConv2d(in_channels, out_channels, kernel_size=3, stride=2, bias=False) self._decoder_bn1 = nn.BatchNorm2d(num_features=out_channels, momentum=bn_mom, eps=bn_eps) + def extract_features(self, inputs): + """use convolution layer to extract feature, + with additional down-sample layer to get 1280 hidden feature. + + Args: + inputs (tensor): Input tensor. + + Returns: + Output of the final convolution + layer in the efficientnet model. + """ + x = super().extract_features(inputs) + # print(f"before downsample size: {x.size()}") + x = self._swish(self._downsample_bn(self._feature_downsample(x))) + return x + + def decode_features(self, inputs): """decoder portion of this autoencoder. @@ -499,8 +539,12 @@ def decode_features(self, inputs): Output of the final convolution layer in the efficientnet model. """ + # upsample + x = self._swish(self._upsample_bn(self._feature_upsample(inputs))) + # print(f"after upsample size: {x.size()}") # Stem - x = self._swish(self._decoder_bn0(self._decoder_conv_stem(inputs))) + x = self._swish(self._decoder_bn0(self._decoder_conv_stem(x))) + # print(f"after decoder_conv_stem: {x.size()}") # Blocks for idx, block in enumerate(self._decoder_blocks): drop_connect_rate = self._global_params.drop_connect_rate @@ -508,17 +552,18 @@ def decode_features(self, inputs): # scale drop connect_rate drop_connect_rate *= float(idx) / len(self._blocks) x = block(x, drop_connect_rate=drop_connect_rate) + # print(f"after block: {x.size()}") # Head x = self._swish(self._decoder_bn1(self._decoder_conv_head(x))) - + # print(f"after decoder_conv_head: {x.size()}") return x def forward(self, inputs): """EfficientNet AutoEncoder's forward function. Calls extract_features to extract features, - then calls decode features to calculate AE output. + then calls decode features to generates original inputs. Args: inputs (tensor): Input tensor. 
@@ -527,14 +572,14 @@ def forward(self, inputs): (AE output tensor, latent representation tensor) """ # Convolution layers + # print(f"input size: {inputs.size()}") x = self.extract_features(inputs) # Pooling and final linear layer - latent_rep = self._avg_pooling(x) - latent_rep = latent_rep.flatten(start_dim=1) - latent_rep = self._dropout(latent_rep) - latent_rep = self._fc(latent_rep) - + latent_rep = x.flatten(start_dim=1) + # print(latent_rep.size()) + # Deconvolution - decoder x = self.decode_features(x) - return x, latent_rep + # print(f"final output size: {x.size()}") + return x, latent_rep \ No newline at end of file diff --git a/efficientnet_pytorch/utils.py b/efficientnet_pytorch/utils.py index 2a95c9a..60d461c 100644 --- a/efficientnet_pytorch/utils.py +++ b/efficientnet_pytorch/utils.py @@ -208,7 +208,10 @@ def get_same_padding_conv2d(image_size=None, transposed=False): Conv2dDynamicSamePadding or Conv2dStaticSamePadding. """ if transposed: - return TransposedConv2dDynamicSamePadding + if image_size is None: + raise NotImplementedError('Unable to dynamically upsample to odd image size.') + else: + return partial(TransposedConv2dStaticSamePadding, image_size=image_size) if image_size is None: return Conv2dDynamicSamePadding @@ -279,7 +282,7 @@ def forward(self, x): x = F.conv2d(x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups) return x -class TransposedConv2dDynamicSamePadding(nn.ConvTranspose2d): +class TransposedConv2dStaticSamePadding(nn.ConvTranspose2d): """2D Convolutions like TensorFlow, for a dynamic image size. The padding is operated in forward function by calculating dynamically. """ @@ -295,32 +298,41 @@ class TransposedConv2dDynamicSamePadding(nn.ConvTranspose2d): # Output after ConvTranspose2d: # (i-1)*s + (k-1)*d + op + 1 - def __init__(self, in_channels, out_channels, kernel_size, stride=1, output_padding=0, groups=1, bias=True, dilation=1): + def __init__(self, in_channels, out_channels, kernel_size, image_size, stride=1, output_padding=0, groups=1, bias=True, dilation=1): super().__init__(in_channels, out_channels, kernel_size, stride, 0, output_padding, groups, bias, dilation) self.stride = self.stride if len(self.stride) == 2 else [self.stride[0]] * 2 self.output_padding = output_padding - - def forward(self, x): - ih, iw = x.size()[-2:] - kh, kw = self.weight.size()[-2:] + # NOTE: image_size here represents the desired output image_size + oh, ow = (image_size, image_size) if isinstance(image_size, int) else image_size + self._oh, self._ow = oh, ow sh, sw = self.stride - oh, ow = ih * sh, iw * sw # change the output size according to stride ! ! ! 
+ ih, iw = math.ceil(oh / sh), math.ceil(ow / sw) # using same calculation in Conv2dStaticSamePadding + self._ih, self._iw = ih, iw + kh, kw = self.weight.size()[-2:] # actual height/width after TransposedConv2d actual_oh = (ih - 1) * self.stride[0] + (kh - 1) * self.dilation[0] + self.output_padding + 1 actual_ow = (iw - 1) * self.stride[1] + (kw - 1) * self.dilation[1] + self.output_padding + 1 crop_h = actual_oh - oh crop_w = actual_ow - ow assert crop_h >= 0 and crop_w >= 0 - + self._crop_h = crop_h + self._crop_w = crop_w + self._actual_oh = actual_oh + self._actual_ow = actual_ow + + def forward(self, x): + # print(f" - Transposed2dStaticPadding input:{x.size()} expected:{self._ih, self._iw}") + # assert x.size()[-2:] == (self._ih, self._iw) x = F.conv_transpose2d(x, self.weight, self.bias, self.stride, self.padding, self.output_padding, self.groups, self.dilation) - assert x.size()[-2:] == (actual_oh, actual_ow) + # assert x.size()[-2:] == (self._actual_oh, self._actual_ow) + crop_h, crop_w = self._crop_h, self._crop_w if crop_h > 0 or crop_w > 0: x = x[:, :, crop_h // 2 : - (crop_h - crop_h // 2), crop_w // 2 : - (crop_w - crop_w // 2)] - - assert x.size()[-2:] == (oh, ow) + # assert x.size()[-2:] == (self._oh, self._ow) return x + def get_same_padding_maxPool2d(image_size=None): """Chooses static padding if you have specified an image size, and dynamic padding otherwise. Static padding is necessary for ONNX exporting of models. @@ -652,7 +664,7 @@ def load_pretrained_weights(model, model_name, weights_path=None, load_fc=True, # TODO: add initialization to missing layers missing_keys = [] for key in ret.missing_keys: - if not key.startswith('_decoder'): + if not key.startswith(('_decoder', '_feature', '_upsample', '_downsample')): missing_keys.append(key) assert not missing_keys, 'Missing keys when loading pretrained weights: {}'.format( From 4095e2ce42264570a57fb39e7925c21ac66bacc2 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Mon, 28 Dec 2020 23:43:13 +0800 Subject: [PATCH 10/10] remove debug print for clarity --- efficientnet_pytorch/model.py | 14 -------------- efficientnet_pytorch/utils.py | 1 - 2 files changed, 15 deletions(-) diff --git a/efficientnet_pytorch/model.py b/efficientnet_pytorch/model.py index 2dfe476..a3450b3 100644 --- a/efficientnet_pytorch/model.py +++ b/efficientnet_pytorch/model.py @@ -296,18 +296,15 @@ def extract_features(self, inputs): """ # Stem x = self._swish(self._bn0(self._conv_stem(inputs))) - # print(f"after conv_stem: {x.size()}") # Blocks for idx, block in enumerate(self._blocks): drop_connect_rate = self._global_params.drop_connect_rate if drop_connect_rate: drop_connect_rate *= float(idx) / len(self._blocks) # scale drop connect_rate x = block(x, drop_connect_rate=drop_connect_rate) - # print(f"after block: {x.size()}") # Head x = self._swish(self._bn1(self._conv_head(x))) - # print(f"after conv_head: {x.size()}") return x def forward(self, inputs): @@ -479,10 +476,8 @@ def __init__(self, blocks_args=None, global_params=None): # Build blocks self._decoder_blocks = nn.ModuleList([]) - # print(f"foward size:\n{self._blocks_image_size}") assert len(self._blocks_image_size) == len(self._blocks_args) + 1 self._blocks_image_size = list(reversed(self._blocks_image_size)) - # print(f"backward size:\n{self._blocks_image_size}") for i, block_args in enumerate(reversed(self._blocks_args)): image_size = self._blocks_image_size[i] # Update block input and output filters based on depth multiplier. 
@@ -492,7 +487,6 @@ def __init__(self, blocks_args=None, global_params=None): output_filters=round_filters(block_args.input_filters, self._global_params), num_repeat=round_repeats(block_args.num_repeat, self._global_params) ) - # print(f"input filter: {block_args.input_filters}, output filter: {block_args.output_filters}") # The first block needs to take care of stride and filter size increase. self._decoder_blocks.append(MBConvBlock(block_args, self._global_params, image_size=image_size, decoder_mode=True, decoder_output_image_size=self._blocks_image_size[i+1])) @@ -523,7 +517,6 @@ def extract_features(self, inputs): layer in the efficientnet model. """ x = super().extract_features(inputs) - # print(f"before downsample size: {x.size()}") x = self._swish(self._downsample_bn(self._feature_downsample(x))) return x @@ -541,10 +534,8 @@ def decode_features(self, inputs): """ # upsample x = self._swish(self._upsample_bn(self._feature_upsample(inputs))) - # print(f"after upsample size: {x.size()}") # Stem x = self._swish(self._decoder_bn0(self._decoder_conv_stem(x))) - # print(f"after decoder_conv_stem: {x.size()}") # Blocks for idx, block in enumerate(self._decoder_blocks): drop_connect_rate = self._global_params.drop_connect_rate @@ -552,11 +543,9 @@ def decode_features(self, inputs): # scale drop connect_rate drop_connect_rate *= float(idx) / len(self._blocks) x = block(x, drop_connect_rate=drop_connect_rate) - # print(f"after block: {x.size()}") # Head x = self._swish(self._decoder_bn1(self._decoder_conv_head(x))) - # print(f"after decoder_conv_head: {x.size()}") return x @@ -572,14 +561,11 @@ def forward(self, inputs): (AE output tensor, latent representation tensor) """ # Convolution layers - # print(f"input size: {inputs.size()}") x = self.extract_features(inputs) # Pooling and final linear layer latent_rep = x.flatten(start_dim=1) - # print(latent_rep.size()) # Deconvolution - decoder x = self.decode_features(x) - # print(f"final output size: {x.size()}") return x, latent_rep \ No newline at end of file diff --git a/efficientnet_pytorch/utils.py b/efficientnet_pytorch/utils.py index 60d461c..42bc656 100644 --- a/efficientnet_pytorch/utils.py +++ b/efficientnet_pytorch/utils.py @@ -321,7 +321,6 @@ def __init__(self, in_channels, out_channels, kernel_size, image_size, stride=1, self._actual_ow = actual_ow def forward(self, x): - # print(f" - Transposed2dStaticPadding input:{x.size()} expected:{self._ih, self._iw}") # assert x.size()[-2:] == (self._ih, self._iw) x = F.conv_transpose2d(x, self.weight, self.bias, self.stride, self.padding, self.output_padding, self.groups, self.dilation)
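With the full series applied, the model is usable end to end. A usage sketch (assumptions: all ten patches applied on top of efficientnet_pytorch; shapes shown are for efficientnet-b0 at 224x224, where the 8-channel bottleneck over the final 7x7 map flattens to a 392-dimensional latent, matching self.feature_size):

import torch
from efficientnet_pytorch.model import EfficientNetAutoEncoder

# Encoder weights come from the pretrained release; the decoder and the
# down/upsample bottleneck stay randomly initialized, since the patched
# load_pretrained_weights skips keys starting with
# _decoder/_feature/_upsample/_downsample.
model = EfficientNetAutoEncoder.from_pretrained('efficientnet-b0')
model.eval()

inputs = torch.rand(1, 3, 224, 224)
with torch.no_grad():
    reconstruction, latent = model(inputs)

print(reconstruction.shape)  # torch.Size([1, 3, 224, 224])
print(latent.shape)          # torch.Size([1, 392]) == 8 * 7 * 7
print(model.feature_size)    # 392

A plain reconstruction objective such as F.mse_loss(reconstruction, inputs) is the natural training signal here, with the pretrained encoder frozen or fine-tuned as needed.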