From 5f243342999fe9f968949b6ca4bf2d3eb61db733 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Sun, 20 Dec 2020 12:11:13 +0800 Subject: [PATCH 01/10] modify model to EfficientNetAutoEncoder --- efficientnet_pytorch/model.py | 110 ++++++++++++++++++++++++++++------ efficientnet_pytorch/utils.py | 71 ++++++++++++++++++++-- 2 files changed, 156 insertions(+), 25 deletions(-) mode change 100755 => 100644 efficientnet_pytorch/model.py mode change 100755 => 100644 efficientnet_pytorch/utils.py diff --git a/efficientnet_pytorch/model.py b/efficientnet_pytorch/model.py old mode 100755 new mode 100644 index e89e97b..8c776e4 --- a/efficientnet_pytorch/model.py +++ b/efficientnet_pytorch/model.py @@ -40,6 +40,7 @@ class MBConvBlock(nn.Module): block_args (namedtuple): BlockArgs, defined in utils.py. global_params (namedtuple): GlobalParam, defined in utils.py. image_size (tuple or list): [image_height, image_width]. + decoder_mode (bool): Reverse the block (deconvolution) if true. References: [1] https://arxiv.org/abs/1704.04861 (MobileNet v1) @@ -47,19 +48,20 @@ class MBConvBlock(nn.Module): [3] https://arxiv.org/abs/1905.02244 (MobileNet v3) """ - def __init__(self, block_args, global_params, image_size=None): + def __init__(self, block_args, global_params, image_size=None, decoder_mode=False): super().__init__() self._block_args = block_args self._bn_mom = 1 - global_params.batch_norm_momentum # pytorch's difference from tensorflow self._bn_eps = global_params.batch_norm_epsilon self.has_se = (self._block_args.se_ratio is not None) and (0 < self._block_args.se_ratio <= 1) self.id_skip = block_args.id_skip # whether to use skip connection and drop connect + self.decoder_mode = decoder_mode # Expansion phase (Inverted Bottleneck) inp = self._block_args.input_filters # number of input channels oup = self._block_args.input_filters * self._block_args.expand_ratio # number of output channels if self._block_args.expand_ratio != 1: - Conv2d = get_same_padding_conv2d(image_size=image_size) + Conv2d = get_same_padding_conv2d(image_size=image_size, transposed=self.decoder_mode) self._expand_conv = Conv2d(in_channels=inp, out_channels=oup, kernel_size=1, bias=False) self._bn0 = nn.BatchNorm2d(num_features=oup, momentum=self._bn_mom, eps=self._bn_eps) # image_size = calculate_output_image_size(image_size, 1) <-- this wouldn't modify image_size @@ -67,23 +69,23 @@ def __init__(self, block_args, global_params, image_size=None): # Depthwise convolution phase k = self._block_args.kernel_size s = self._block_args.stride - Conv2d = get_same_padding_conv2d(image_size=image_size) + Conv2d = get_same_padding_conv2d(image_size=image_size, transposed=self.decoder_mode) self._depthwise_conv = Conv2d( in_channels=oup, out_channels=oup, groups=oup, # groups makes it depthwise kernel_size=k, stride=s, bias=False) self._bn1 = nn.BatchNorm2d(num_features=oup, momentum=self._bn_mom, eps=self._bn_eps) - image_size = calculate_output_image_size(image_size, s) + image_size = calculate_output_image_size(image_size, s, transposed=self.decoder_mode) # Squeeze and Excitation layer, if desired if self.has_se: - Conv2d = get_same_padding_conv2d(image_size=(1, 1)) + Conv2d = get_same_padding_conv2d(image_size=(1, 1), transposed=self.decoder_mode) num_squeezed_channels = max(1, int(self._block_args.input_filters * self._block_args.se_ratio)) self._se_reduce = Conv2d(in_channels=oup, out_channels=num_squeezed_channels, kernel_size=1) self._se_expand = Conv2d(in_channels=num_squeezed_channels, out_channels=oup, kernel_size=1) # Pointwise 
convolution phase final_oup = self._block_args.output_filters - Conv2d = get_same_padding_conv2d(image_size=image_size) + Conv2d = get_same_padding_conv2d(image_size=image_size, transposed=self.decoder_mode) self._project_conv = Conv2d(in_channels=oup, out_channels=final_oup, kernel_size=1, bias=False) self._bn2 = nn.BatchNorm2d(num_features=final_oup, momentum=self._bn_mom, eps=self._bn_eps) self._swish = MemoryEfficientSwish() @@ -140,8 +142,8 @@ def set_swish(self, memory_efficient=True): self._swish = MemoryEfficientSwish() if memory_efficient else Swish() -class EfficientNet(nn.Module): - """EfficientNet model. +class EfficientNetAutoEncoder(nn.Module): + """EfficientNet AutoEncoder model. Most easily loaded with the .from_name or .from_pretrained methods. Args: @@ -173,10 +175,10 @@ def __init__(self, blocks_args=None, global_params=None): bn_mom = 1 - self._global_params.batch_norm_momentum bn_eps = self._global_params.batch_norm_epsilon + # ==== EfficientNet Encoder ==== # Get stem static or dynamic convolution depending on image size image_size = global_params.image_size Conv2d = get_same_padding_conv2d(image_size=image_size) - # Stem in_channels = 3 # rgb out_channels = round_filters(32, self._global_params) # number of output channels @@ -211,12 +213,51 @@ def __init__(self, blocks_args=None, global_params=None): self._conv_head = Conv2d(in_channels, out_channels, kernel_size=1, bias=False) self._bn1 = nn.BatchNorm2d(num_features=out_channels, momentum=bn_mom, eps=bn_eps) - # Final linear layer + # ==== Linear layer for latent space ==== self._avg_pooling = nn.AdaptiveAvgPool2d(1) self._dropout = nn.Dropout(self._global_params.dropout_rate) self._fc = nn.Linear(out_channels, self._global_params.num_classes) self._swish = MemoryEfficientSwish() + # ==== EfficientNet Decoder ==== + # use dynamic image size for decoder + TransposedConv2d = get_same_padding_conv2d(image_size=image_size, transposed=True) + + # Stem + # number of output channels from encoder model + in_channels, out_channels = out_channels, in_channels + # self._decoder_conv_stem symmetry to self._conv_head + self._decoder_conv_stem = TransposedConv2d(in_channels, out_channels, kernel_size=1, bias=False) + image_size = calculate_output_image_size(image_size, 1, transposed=True) + self._decoder_bn0 = nn.BatchNorm2d(num_features=out_channels, momentum=bn_mom, eps=bn_eps) + # image_size = calculate_output_image_size(image_size, 2) + + # Build blocks + self._decoder_blocks = nn.ModuleList([]) + for block_args in reversed(self._blocks_args): + + # Update block input and output filters based on depth multiplier. + # NOTE: input/output are flip here to support deconvolution + block_args = block_args._replace( + input_filters=round_filters(block_args.output_filters, self._global_params), + output_filters=round_filters(block_args.input_filters, self._global_params), + num_repeat=round_repeats(block_args.num_repeat, self._global_params) + ) + # The first block needs to take care of stride and filter size increase. 
+ self._decoder_blocks.append(MBConvBlock(block_args, self._global_params, image_size=image_size, decoder_mode=True)) + image_size = calculate_output_image_size(image_size, block_args.stride, transposed=True) + if block_args.num_repeat > 1: # modify block_args to keep same output size + block_args = block_args._replace(input_filters=block_args.output_filters, stride=1) + for _ in range(block_args.num_repeat - 1): + self._decoder_blocks.append(MBConvBlock(block_args, self._global_params, image_size=image_size, decoder_mode=True)) + # image_size = calculate_output_image_size(image_size, block_args.stride) # stride = 1 + + # Head + in_channels = round_filters(32, self._global_params) # number of output channels + out_channels = 3 # rgb + self._decoder_conv_head = TransposedConv2d(in_channels, out_channels, kernel_size=3, stride=2, bias=False) + self._decoder_bn1 = nn.BatchNorm2d(num_features=out_channels, momentum=bn_mom, eps=bn_eps) + def set_swish(self, memory_efficient=True): """Sets swish function as memory efficient (for training) or standard (for export). @@ -272,6 +313,33 @@ def extract_endpoints(self, inputs): return endpoints + def decode_features(self, inputs): + """decoder portion of this autoencoder. + + Args: + inputs (tensor): Input tensor to the decoder, + usually from self.extract_features + + Returns: + Output of the final convolution + layer in the efficientnet model. + """ + # Stem + x = self._swish(self._decoder_bn0(self._decoder_conv_stem(inputs))) + # Blocks + for idx, block in enumerate(self._decoder_blocks): + drop_connect_rate = self._global_params.drop_connect_rate + if drop_connect_rate: + # scale drop connect_rate + drop_connect_rate *= float(idx) / len(self._blocks) + x = block(x, drop_connect_rate=drop_connect_rate) + + # Head + x = self._swish(self._decoder_bn1(self._decoder_conv_head(x))) + + return x + + def extract_features(self, inputs): """use convolution layer to extract feature . @@ -298,24 +366,28 @@ def extract_features(self, inputs): return x def forward(self, inputs): - """EfficientNet's forward function. - Calls extract_features to extract features, applies final linear layer, and returns logits. + """EfficientNet AutoEncoder's forward function. + Calls extract_features to extract features, + then calls decode features to generates original inputs. Args: inputs (tensor): Input tensor. Returns: - Output of this model after processing. + (AE output tensor, latent representation tensor) """ # Convolution layers x = self.extract_features(inputs) + # Pooling and final linear layer - x = self._avg_pooling(x) - if self._global_params.include_top: - x = x.flatten(start_dim=1) - x = self._dropout(x) - x = self._fc(x) - return x + latent_rep = self._avg_pooling(x) + latent_rep = latent_rep.flatten(start_dim=1) + latent_rep = self._dropout(latent_rep) + latent_rep = self._fc(latent_rep) + + # Deconvolution - decoder + x = self.decode_features(x) + return x, latent_rep @classmethod def from_name(cls, model_name, in_channels=3, **override_params): diff --git a/efficientnet_pytorch/utils.py b/efficientnet_pytorch/utils.py old mode 100755 new mode 100644 index 6a84345..2a95c9a --- a/efficientnet_pytorch/utils.py +++ b/efficientnet_pytorch/utils.py @@ -167,7 +167,7 @@ def get_width_and_height_from_size(x): raise TypeError() -def calculate_output_image_size(input_image_size, stride): +def calculate_output_image_size(input_image_size, stride, transposed=False): """Calculates the output image size when using Conv2dSamePadding with a stride. 
Necessary for static padding. Thanks to mannatsingh for pointing this out. @@ -182,8 +182,12 @@ def calculate_output_image_size(input_image_size, stride): return None image_height, image_width = get_width_and_height_from_size(input_image_size) stride = stride if isinstance(stride, int) else stride[0] - image_height = int(math.ceil(image_height / stride)) - image_width = int(math.ceil(image_width / stride)) + if transposed: + image_height = int(image_height * stride) + image_width = int(image_width * stride) + else: + image_height = int(math.ceil(image_height / stride)) + image_width = int(math.ceil(image_width / stride)) return [image_height, image_width] @@ -192,16 +196,20 @@ def calculate_output_image_size(input_image_size, stride): # Only when stride equals 1, can the output size be the same as input size. # Don't be confused by their function names ! ! ! -def get_same_padding_conv2d(image_size=None): +def get_same_padding_conv2d(image_size=None, transposed=False): """Chooses static padding if you have specified an image size, and dynamic padding otherwise. Static padding is necessary for ONNX exporting of models. Args: image_size (int or tuple): Size of the image. + transposed (bool): use nn.functional.conv_transpose2d if true, and nn.functional.conv2d otherwise. Returns: Conv2dDynamicSamePadding or Conv2dStaticSamePadding. """ + if transposed: + return TransposedConv2dDynamicSamePadding + if image_size is None: return Conv2dDynamicSamePadding else: @@ -271,6 +279,47 @@ def forward(self, x): x = F.conv2d(x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups) return x +class TransposedConv2dDynamicSamePadding(nn.ConvTranspose2d): + """2D Convolutions like TensorFlow, for a dynamic image size. + The padding is operated in forward function by calculating dynamically. + """ + + # Tips for 'SAME' mode padding. + # Given the following: + # i: width or height + # s: stride + # k: kernel size + # d: dilation + # p: padding + # op: output padding + # Output after ConvTranspose2d: + # (i-1)*s + (k-1)*d + op + 1 + + def __init__(self, in_channels, out_channels, kernel_size, stride=1, output_padding=0, groups=1, bias=True, dilation=1): + super().__init__(in_channels, out_channels, kernel_size, stride, 0, output_padding, groups, bias, dilation) + self.stride = self.stride if len(self.stride) == 2 else [self.stride[0]] * 2 + self.output_padding = output_padding + + def forward(self, x): + ih, iw = x.size()[-2:] + kh, kw = self.weight.size()[-2:] + sh, sw = self.stride + oh, ow = ih * sh, iw * sw # change the output size according to stride ! ! ! + # actual height/width after TransposedConv2d + actual_oh = (ih - 1) * self.stride[0] + (kh - 1) * self.dilation[0] + self.output_padding + 1 + actual_ow = (iw - 1) * self.stride[1] + (kw - 1) * self.dilation[1] + self.output_padding + 1 + crop_h = actual_oh - oh + crop_w = actual_ow - ow + assert crop_h >= 0 and crop_w >= 0 + + x = F.conv_transpose2d(x, self.weight, self.bias, self.stride, self.padding, + self.output_padding, self.groups, self.dilation) + assert x.size()[-2:] == (actual_oh, actual_ow) + if crop_h > 0 or crop_w > 0: + x = x[:, :, crop_h // 2 : - (crop_h - crop_h // 2), crop_w // 2 : - (crop_w - crop_w // 2)] + + assert x.size()[-2:] == (oh, ow) + return x def get_same_padding_maxPool2d(image_size=None): """Chooses static padding if you have specified an image size, and dynamic padding otherwise. 
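To make the 'SAME' cropping logic above concrete, here is a minimal standalone sketch (not part of the patch; the sizes i=7, k=3, s=2 are chosen for illustration, matching the decoder head) that reproduces the same arithmetic with plain torch.nn.functional:

import torch
import torch.nn.functional as F

# 'SAME' deconvolution targets i*s; a raw ConvTranspose2d instead yields
# (i-1)*s + (k-1)*d + op + 1, so the surplus rows/columns are cropped away.
i, k, s, d, op = 7, 3, 2, 1, 0                       # illustrative sizes only
actual = (i - 1) * s + (k - 1) * d + op + 1          # 15
target = i * s                                       # 14
crop = actual - target                               # 1

x = torch.randn(1, 1, i, i)
w = torch.randn(1, 1, k, k)                          # (in_channels, out_channels, kH, kW)
y = F.conv_transpose2d(x, w, stride=s)
assert y.size()[-2:] == (actual, actual)
if crop > 0:                                         # same centered crop as forward() above
    y = y[:, :, crop // 2 : -(crop - crop // 2), crop // 2 : -(crop - crop // 2)]
assert y.size()[-2:] == (target, target)

The guard on a positive crop mirrors the `if crop_h > 0 or crop_w > 0` check in the forward above; without it, a zero crop would produce an empty slice.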
@@ -598,13 +647,23 @@ def load_pretrained_weights(model, model_name, weights_path=None, load_fc=True, if load_fc: ret = model.load_state_dict(state_dict, strict=False) - assert not ret.missing_keys, 'Missing keys when loading pretrained weights: {}'.format(ret.missing_keys) + + # weights for decoder are not loaded + # TODO: add initialization to missing layers + missing_keys = [] + for key in ret.missing_keys: + if not key.startswith('_decoder'): + missing_keys.append(key) + + assert not missing_keys, 'Missing keys when loading pretrained weights: {}'.format( + missing_keys) else: state_dict.pop('_fc.weight') state_dict.pop('_fc.bias') ret = model.load_state_dict(state_dict, strict=False) assert set(ret.missing_keys) == set( ['_fc.weight', '_fc.bias']), 'Missing keys when loading pretrained weights: {}'.format(ret.missing_keys) - assert not ret.unexpected_keys, 'Missing keys when loading pretrained weights: {}'.format(ret.unexpected_keys) + assert not ret.unexpected_keys, 'Missing keys when loading pretrained weights: {}'.format( + ret.unexpected_keys) print('Loaded pretrained weights for {}'.format(model_name)) From ff4dec4aafb36327cd6956db93c127c6b99e5683 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Sun, 20 Dec 2020 12:14:12 +0800 Subject: [PATCH 02/10] add comments --- efficientnet_pytorch/model.py | 4 ++-- efficientnet_pytorch/utils.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/efficientnet_pytorch/model.py b/efficientnet_pytorch/model.py index 8c776e4..e5f50ad 100644 --- a/efficientnet_pytorch/model.py +++ b/efficientnet_pytorch/model.py @@ -159,9 +159,9 @@ class EfficientNetAutoEncoder(nn.Module): import torch >>> from efficientnet.model import EfficientNet >>> inputs = torch.rand(1, 3, 224, 224) - >>> model = EfficientNet.from_pretrained('efficientnet-b0') + >>> model = EfficientNetAutoEncoder.from_pretrained('efficientnet-b0') >>> model.eval() - >>> outputs = model(inputs) + >>> ae_output, latent_fc_output = model(inputs) """ def __init__(self, blocks_args=None, global_params=None): diff --git a/efficientnet_pytorch/utils.py b/efficientnet_pytorch/utils.py index 2a95c9a..7871a87 100644 --- a/efficientnet_pytorch/utils.py +++ b/efficientnet_pytorch/utils.py @@ -654,6 +654,7 @@ def load_pretrained_weights(model, model_name, weights_path=None, load_fc=True, for key in ret.missing_keys: if not key.startswith('_decoder'): missing_keys.append(key) + print(f"Weights for _decoder keys are not loaded.") assert not missing_keys, 'Missing keys when loading pretrained weights: {}'.format( missing_keys) From 4d64d871d550b35dc4eb508a174b6a114722b9b1 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Sun, 20 Dec 2020 12:22:51 +0800 Subject: [PATCH 03/10] make compatible with original EfficientNet --- efficientnet_pytorch/model.py | 220 ++++++++++++++++++++-------------- efficientnet_pytorch/utils.py | 1 - 2 files changed, 132 insertions(+), 89 deletions(-) diff --git a/efficientnet_pytorch/model.py b/efficientnet_pytorch/model.py index e5f50ad..b9ea59c 100644 --- a/efficientnet_pytorch/model.py +++ b/efficientnet_pytorch/model.py @@ -142,8 +142,8 @@ def set_swish(self, memory_efficient=True): self._swish = MemoryEfficientSwish() if memory_efficient else Swish() -class EfficientNetAutoEncoder(nn.Module): - """EfficientNet AutoEncoder model. +class EfficientNet(nn.Module): + """EfficientNet model. Most easily loaded with the .from_name or .from_pretrained methods. 
Args: @@ -154,14 +154,12 @@ class EfficientNetAutoEncoder(nn.Module): [1] https://arxiv.org/abs/1905.11946 (EfficientNet) Example: - - - import torch + >>> import torch >>> from efficientnet.model import EfficientNet >>> inputs = torch.rand(1, 3, 224, 224) - >>> model = EfficientNetAutoEncoder.from_pretrained('efficientnet-b0') + >>> model = EfficientNet.from_pretrained('efficientnet-b0') >>> model.eval() - >>> ae_output, latent_fc_output = model(inputs) + >>> outputs = model(inputs) """ def __init__(self, blocks_args=None, global_params=None): @@ -175,10 +173,10 @@ def __init__(self, blocks_args=None, global_params=None): bn_mom = 1 - self._global_params.batch_norm_momentum bn_eps = self._global_params.batch_norm_epsilon - # ==== EfficientNet Encoder ==== # Get stem static or dynamic convolution depending on image size image_size = global_params.image_size Conv2d = get_same_padding_conv2d(image_size=image_size) + # Stem in_channels = 3 # rgb out_channels = round_filters(32, self._global_params) # number of output channels @@ -213,51 +211,12 @@ def __init__(self, blocks_args=None, global_params=None): self._conv_head = Conv2d(in_channels, out_channels, kernel_size=1, bias=False) self._bn1 = nn.BatchNorm2d(num_features=out_channels, momentum=bn_mom, eps=bn_eps) - # ==== Linear layer for latent space ==== + # Final linear layer self._avg_pooling = nn.AdaptiveAvgPool2d(1) self._dropout = nn.Dropout(self._global_params.dropout_rate) self._fc = nn.Linear(out_channels, self._global_params.num_classes) self._swish = MemoryEfficientSwish() - # ==== EfficientNet Decoder ==== - # use dynamic image size for decoder - TransposedConv2d = get_same_padding_conv2d(image_size=image_size, transposed=True) - - # Stem - # number of output channels from encoder model - in_channels, out_channels = out_channels, in_channels - # self._decoder_conv_stem symmetry to self._conv_head - self._decoder_conv_stem = TransposedConv2d(in_channels, out_channels, kernel_size=1, bias=False) - image_size = calculate_output_image_size(image_size, 1, transposed=True) - self._decoder_bn0 = nn.BatchNorm2d(num_features=out_channels, momentum=bn_mom, eps=bn_eps) - # image_size = calculate_output_image_size(image_size, 2) - - # Build blocks - self._decoder_blocks = nn.ModuleList([]) - for block_args in reversed(self._blocks_args): - - # Update block input and output filters based on depth multiplier. - # NOTE: input/output are flip here to support deconvolution - block_args = block_args._replace( - input_filters=round_filters(block_args.output_filters, self._global_params), - output_filters=round_filters(block_args.input_filters, self._global_params), - num_repeat=round_repeats(block_args.num_repeat, self._global_params) - ) - # The first block needs to take care of stride and filter size increase. 
- self._decoder_blocks.append(MBConvBlock(block_args, self._global_params, image_size=image_size, decoder_mode=True)) - image_size = calculate_output_image_size(image_size, block_args.stride, transposed=True) - if block_args.num_repeat > 1: # modify block_args to keep same output size - block_args = block_args._replace(input_filters=block_args.output_filters, stride=1) - for _ in range(block_args.num_repeat - 1): - self._decoder_blocks.append(MBConvBlock(block_args, self._global_params, image_size=image_size, decoder_mode=True)) - # image_size = calculate_output_image_size(image_size, block_args.stride) # stride = 1 - - # Head - in_channels = round_filters(32, self._global_params) # number of output channels - out_channels = 3 # rgb - self._decoder_conv_head = TransposedConv2d(in_channels, out_channels, kernel_size=3, stride=2, bias=False) - self._decoder_bn1 = nn.BatchNorm2d(num_features=out_channels, momentum=bn_mom, eps=bn_eps) - def set_swish(self, memory_efficient=True): """Sets swish function as memory efficient (for training) or standard (for export). @@ -313,33 +272,6 @@ def extract_endpoints(self, inputs): return endpoints - def decode_features(self, inputs): - """decoder portion of this autoencoder. - - Args: - inputs (tensor): Input tensor to the decoder, - usually from self.extract_features - - Returns: - Output of the final convolution - layer in the efficientnet model. - """ - # Stem - x = self._swish(self._decoder_bn0(self._decoder_conv_stem(inputs))) - # Blocks - for idx, block in enumerate(self._decoder_blocks): - drop_connect_rate = self._global_params.drop_connect_rate - if drop_connect_rate: - # scale drop connect_rate - drop_connect_rate *= float(idx) / len(self._blocks) - x = block(x, drop_connect_rate=drop_connect_rate) - - # Head - x = self._swish(self._decoder_bn1(self._decoder_conv_head(x))) - - return x - - def extract_features(self, inputs): """use convolution layer to extract feature . @@ -366,28 +298,25 @@ def extract_features(self, inputs): return x def forward(self, inputs): - """EfficientNet AutoEncoder's forward function. - Calls extract_features to extract features, - then calls decode features to generates original inputs. + """EfficientNet's forward function. + Calls extract_features to extract features, applies final linear layer, and returns logits. Args: inputs (tensor): Input tensor. Returns: - (AE output tensor, latent representation tensor) + Output of this model after processing. """ # Convolution layers x = self.extract_features(inputs) - + # Pooling and final linear layer - latent_rep = self._avg_pooling(x) - latent_rep = latent_rep.flatten(start_dim=1) - latent_rep = self._dropout(latent_rep) - latent_rep = self._fc(latent_rep) - - # Deconvolution - decoder - x = self.decode_features(x) - return x, latent_rep + x = self._avg_pooling(x) + x = x.flatten(start_dim=1) + x = self._dropout(x) + x = self._fc(x) + + return x @classmethod def from_name(cls, model_name, in_channels=3, **override_params): @@ -485,3 +414,118 @@ def _change_in_channels(self, in_channels): Conv2d = get_same_padding_conv2d(image_size=self._global_params.image_size) out_channels = round_filters(32, self._global_params) self._conv_stem = Conv2d(in_channels, out_channels, kernel_size=3, stride=2, bias=False) + +class EfficientNetAutoEncoder(EfficientNet): + """EfficientNet AutoEncoder model. + Most easily loaded with the .from_name or .from_pretrained methods. + + Args: + blocks_args (list[namedtuple]): A list of BlockArgs to construct blocks. 
+ global_params (namedtuple): A set of GlobalParams shared between blocks. + + References: + [1] https://arxiv.org/abs/1905.11946 (EfficientNet) + + Example: + + + import torch + >>> from efficientnet.model import EfficientNet + >>> inputs = torch.rand(1, 3, 224, 224) + >>> model = EfficientNetAutoEncoder.from_pretrained('efficientnet-b0') + >>> model.eval() + >>> ae_output, latent_fc_output = model(inputs) + """ + + def __init__(self, blocks_args=None, global_params=None): + super().__init__(blocks_args=blocks_args, global_params=global_params) + + # EfficientNet Decoder + # use dynamic image size for decoder + TransposedConv2d = get_same_padding_conv2d(image_size=image_size, transposed=True) + + # Stem + # number of output channels from encoder model + in_channels, out_channels = out_channels, in_channels + # self._decoder_conv_stem symmetry to self._conv_head + self._decoder_conv_stem = TransposedConv2d(in_channels, out_channels, kernel_size=1, bias=False) + image_size = calculate_output_image_size(image_size, 1, transposed=True) + self._decoder_bn0 = nn.BatchNorm2d(num_features=out_channels, momentum=bn_mom, eps=bn_eps) + # image_size = calculate_output_image_size(image_size, 2) + + # Build blocks + self._decoder_blocks = nn.ModuleList([]) + for block_args in reversed(self._blocks_args): + + # Update block input and output filters based on depth multiplier. + # NOTE: input/output are flip here to support deconvolution + block_args = block_args._replace( + input_filters=round_filters(block_args.output_filters, self._global_params), + output_filters=round_filters(block_args.input_filters, self._global_params), + num_repeat=round_repeats(block_args.num_repeat, self._global_params) + ) + # The first block needs to take care of stride and filter size increase. + self._decoder_blocks.append(MBConvBlock(block_args, self._global_params, image_size=image_size, decoder_mode=True)) + image_size = calculate_output_image_size(image_size, block_args.stride, transposed=True) + if block_args.num_repeat > 1: # modify block_args to keep same output size + block_args = block_args._replace(input_filters=block_args.output_filters, stride=1) + for _ in range(block_args.num_repeat - 1): + self._decoder_blocks.append(MBConvBlock(block_args, self._global_params, image_size=image_size, decoder_mode=True)) + # image_size = calculate_output_image_size(image_size, block_args.stride) # stride = 1 + + # Head + in_channels = round_filters(32, self._global_params) # number of output channels + out_channels = 3 # rgb + self._decoder_conv_head = TransposedConv2d(in_channels, out_channels, kernel_size=3, stride=2, bias=False) + self._decoder_bn1 = nn.BatchNorm2d(num_features=out_channels, momentum=bn_mom, eps=bn_eps) + + def decode_features(self, inputs): + """decoder portion of this autoencoder. + + Args: + inputs (tensor): Input tensor to the decoder, + usually from self.extract_features + + Returns: + Output of the final convolution + layer in the efficientnet model. + """ + # Stem + x = self._swish(self._decoder_bn0(self._decoder_conv_stem(inputs))) + # Blocks + for idx, block in enumerate(self._decoder_blocks): + drop_connect_rate = self._global_params.drop_connect_rate + if drop_connect_rate: + # scale drop connect_rate + drop_connect_rate *= float(idx) / len(self._blocks) + x = block(x, drop_connect_rate=drop_connect_rate) + + # Head + x = self._swish(self._decoder_bn1(self._decoder_conv_head(x))) + + return x + + + def forward(self, inputs): + """EfficientNet AutoEncoder's forward function. 
+ Calls extract_features to extract features, + then calls decode features to generates original inputs. + + Args: + inputs (tensor): Input tensor. + + Returns: + (AE output tensor, latent representation tensor) + """ + # Convolution layers + x = self.extract_features(inputs) + + # Pooling and final linear layer + latent_rep = self._avg_pooling(x) + latent_rep = latent_rep.flatten(start_dim=1) + latent_rep = self._dropout(latent_rep) + latent_rep = self._fc(latent_rep) + + # Deconvolution - decoder + x = self.decode_features(x) + return x, latent_rep diff --git a/efficientnet_pytorch/utils.py b/efficientnet_pytorch/utils.py index 7871a87..2a95c9a 100644 --- a/efficientnet_pytorch/utils.py +++ b/efficientnet_pytorch/utils.py @@ -654,7 +654,6 @@ def load_pretrained_weights(model, model_name, weights_path=None, load_fc=True, for key in ret.missing_keys: if not key.startswith('_decoder'): missing_keys.append(key) - print(f"Weights for _decoder keys are not loaded.") assert not missing_keys, 'Missing keys when loading pretrained weights: {}'.format( missing_keys) From b975d38de9a189964619c3cddbdf1f5ba1f79165 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Sun, 20 Dec 2020 13:09:29 +0800 Subject: [PATCH 04/10] revert forward func of EfficientNet --- efficientnet_pytorch/model.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/efficientnet_pytorch/model.py b/efficientnet_pytorch/model.py index b9ea59c..79e7da7 100644 --- a/efficientnet_pytorch/model.py +++ b/efficientnet_pytorch/model.py @@ -312,10 +312,10 @@ def forward(self, inputs): # Pooling and final linear layer x = self._avg_pooling(x) - x = x.flatten(start_dim=1) - x = self._dropout(x) - x = self._fc(x) - + if self._global_params.include_top: + x = x.flatten(start_dim=1) + x = self._dropout(x) + x = self._fc(x) return x @classmethod From e8648d83bfb1dfd0d5e54545ab5f5841f9a77816 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Sun, 20 Dec 2020 13:11:22 +0800 Subject: [PATCH 05/10] fix indentation of EfficientNet --- efficientnet_pytorch/model.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/efficientnet_pytorch/model.py b/efficientnet_pytorch/model.py index 79e7da7..a8db4ef 100644 --- a/efficientnet_pytorch/model.py +++ b/efficientnet_pytorch/model.py @@ -239,7 +239,9 @@ def extract_endpoints(self, inputs): Dictionary of last intermediate features with reduction levels i in [1, 2, 3, 4, 5]. 
Example: - >>> import torch + + + import torch >>> from efficientnet.model import EfficientNet >>> inputs = torch.rand(1, 3, 224, 224) >>> model = EfficientNet.from_pretrained('efficientnet-b0') From b34fb73d67739a61b5eff76c54b33da3531a30f7 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Sun, 20 Dec 2020 23:03:30 +0800 Subject: [PATCH 06/10] fixed variable --- efficientnet_pytorch/model.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/efficientnet_pytorch/model.py b/efficientnet_pytorch/model.py index a8db4ef..c99c5c9 100644 --- a/efficientnet_pytorch/model.py +++ b/efficientnet_pytorch/model.py @@ -170,8 +170,8 @@ def __init__(self, blocks_args=None, global_params=None): self._blocks_args = blocks_args # Batch norm parameters - bn_mom = 1 - self._global_params.batch_norm_momentum - bn_eps = self._global_params.batch_norm_epsilon + self._bn_mom = bn_mom = 1 - self._global_params.batch_norm_momentum + self._bn_eps = bn_eps = self._global_params.batch_norm_epsilon # Get stem static or dynamic convolution depending on image size image_size = global_params.image_size @@ -217,6 +217,8 @@ def __init__(self, blocks_args=None, global_params=None): self._fc = nn.Linear(out_channels, self._global_params.num_classes) self._swish = MemoryEfficientSwish() + self._image_size = image_size + def set_swish(self, memory_efficient=True): """Sets swish function as memory efficient (for training) or standard (for export). @@ -241,7 +243,7 @@ def extract_endpoints(self, inputs): Example: - import torch + >>> import torch >>> from efficientnet.model import EfficientNet >>> inputs = torch.rand(1, 3, 224, 224) >>> model = EfficientNet.from_pretrained('efficientnet-b0') @@ -431,7 +433,7 @@ class EfficientNetAutoEncoder(EfficientNet): Example: - import torch + >>> import torch >>> from efficientnet.model import EfficientNet >>> inputs = torch.rand(1, 3, 224, 224) >>> model = EfficientNetAutoEncoder.from_pretrained('efficientnet-b0') @@ -441,15 +443,18 @@ class EfficientNetAutoEncoder(EfficientNet): def __init__(self, blocks_args=None, global_params=None): super().__init__(blocks_args=blocks_args, global_params=global_params) - + bn_mom = self._bn_mon + bn_eps = self._bn_eps + image_size = self._image_size + # EfficientNet Decoder # use dynamic image size for decoder TransposedConv2d = get_same_padding_conv2d(image_size=image_size, transposed=True) # Stem - # number of output channels from encoder model - in_channels, out_channels = out_channels, in_channels # self._decoder_conv_stem symmetry to self._conv_head + in_channels = round_filters(1280, self._global_params) + out_channels = block_args.output_filters # output of final block self._decoder_conv_stem = TransposedConv2d(in_channels, out_channels, kernel_size=1, bias=False) image_size = calculate_output_image_size(image_size, 1, transposed=True) self._decoder_bn0 = nn.BatchNorm2d(num_features=out_channels, momentum=bn_mom, eps=bn_eps) @@ -460,7 +465,7 @@ def __init__(self, blocks_args=None, global_params=None): for block_args in reversed(self._blocks_args): # Update block input and output filters based on depth multiplier. 
- # NOTE: input/output are flip here to support deconvolution + # input/output are flip here to support deconvolution block_args = block_args._replace( input_filters=round_filters(block_args.output_filters, self._global_params), output_filters=round_filters(block_args.input_filters, self._global_params), From 52067bd550c12faec6b4049fa79cf52c4dd79334 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Sun, 20 Dec 2020 23:11:38 +0800 Subject: [PATCH 07/10] fixed variable --- efficientnet_pytorch/model.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/efficientnet_pytorch/model.py b/efficientnet_pytorch/model.py index c99c5c9..3bca7b7 100644 --- a/efficientnet_pytorch/model.py +++ b/efficientnet_pytorch/model.py @@ -218,6 +218,7 @@ def __init__(self, blocks_args=None, global_params=None): self._swish = MemoryEfficientSwish() self._image_size = image_size + self._last_block_args = block_args def set_swish(self, memory_efficient=True): """Sets swish function as memory efficient (for training) or standard (for export). @@ -443,9 +444,10 @@ class EfficientNetAutoEncoder(EfficientNet): def __init__(self, blocks_args=None, global_params=None): super().__init__(blocks_args=blocks_args, global_params=global_params) - bn_mom = self._bn_mon + bn_mom = self._bn_mom bn_eps = self._bn_eps image_size = self._image_size + block_args = self._last_block_args # EfficientNet Decoder # use dynamic image size for decoder From f974bad36d8529eebcdb5af01263b080d29cdd54 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Sun, 20 Dec 2020 23:17:15 +0800 Subject: [PATCH 08/10] modify comments --- efficientnet_pytorch/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/efficientnet_pytorch/model.py b/efficientnet_pytorch/model.py index 3bca7b7..870dfd6 100644 --- a/efficientnet_pytorch/model.py +++ b/efficientnet_pytorch/model.py @@ -518,7 +518,7 @@ def decode_features(self, inputs): def forward(self, inputs): """EfficientNet AutoEncoder's forward function. Calls extract_features to extract features, - then calls decode features to generates original inputs. + then calls decode features to calculate AE output. Args: inputs (tensor): Input tensor. 
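The size bookkeeping that patches 01-08 rely on can be checked in isolation: the encoder ceil-divides each spatial size by the stride, while the decoder multiplies by it, so the round trip is only exact when every intermediate size is even. A small sketch (a plain re-implementation of calculate_output_image_size from patch 01, for illustration; the stride list assumes efficientnet-b0's stem plus its seven block groups) shows both the clean round trip and the odd-size failure that the next patch addresses:

import math

def calculate_output_image_size(size, stride, transposed=False):
    # mirrors the helper from patch 01 (int strides only, for brevity)
    if transposed:
        return [size[0] * stride, size[1] * stride]
    return [int(math.ceil(size[0] / stride)), int(math.ceil(size[1] / stride))]

encoder_strides = [2, 1, 2, 2, 2, 1, 2, 1]   # efficientnet-b0: stem, then 7 block groups

size = [224, 224]
for s in encoder_strides:
    size = calculate_output_image_size(size, s)
print(size)                                   # [7, 7] at the encoder head
for s in reversed(encoder_strides):
    size = calculate_output_image_size(size, s, transposed=True)
print(size)                                   # [224, 224] -- round trip is exact here

size = calculate_output_image_size([15, 15], 2)               # ceil(15/2) = 8
size = calculate_output_image_size(size, 2, transposed=True)  # 8*2 = 16 != 15
print(size)                                   # the odd-size mismatch patch 09 fixes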
From 16e0633386c6190af2884b0125c35a4fb306256b Mon Sep 17 00:00:00 2001
From: Xingyao Wang
Date: Mon, 28 Dec 2020 23:41:45 +0800
Subject: [PATCH 09/10] add TransposedConv2dStaticSamePadding to fix the
 TransposedConv2d odd-image-size issue; add a latent feature via
 down/upsampling between encoder and decoder

---
 efficientnet_pytorch/model.py | 81 +++++++++++++++++++++++++++--------
 efficientnet_pytorch/utils.py | 38 ++++++++++------
 2 files changed, 88 insertions(+), 31 deletions(-)

diff --git a/efficientnet_pytorch/model.py b/efficientnet_pytorch/model.py
index 870dfd6..2dfe476 100644
--- a/efficientnet_pytorch/model.py
+++ b/efficientnet_pytorch/model.py
@@ -48,7 +48,7 @@ class MBConvBlock(nn.Module):
         [3] https://arxiv.org/abs/1905.02244 (MobileNet v3)
     """

-    def __init__(self, block_args, global_params, image_size=None, decoder_mode=False):
+    def __init__(self, block_args, global_params, image_size=None, decoder_mode=False, decoder_output_image_size=None):
         super().__init__()
         self._block_args = block_args
         self._bn_mom = 1 - global_params.batch_norm_momentum # pytorch's difference from tensorflow
@@ -69,12 +69,16 @@ def __init__(self, block_args, global_params, image_size=None, decoder_mode=Fals
         # Depthwise convolution phase
         k = self._block_args.kernel_size
         s = self._block_args.stride
+        if self.decoder_mode:
+            # assert decoder_output_image_size
+            image_size = decoder_output_image_size
         Conv2d = get_same_padding_conv2d(image_size=image_size, transposed=self.decoder_mode)
         self._depthwise_conv = Conv2d(
             in_channels=oup, out_channels=oup, groups=oup, # groups makes it depthwise
             kernel_size=k, stride=s, bias=False)
         self._bn1 = nn.BatchNorm2d(num_features=oup, momentum=self._bn_mom, eps=self._bn_eps)
-        image_size = calculate_output_image_size(image_size, s, transposed=self.decoder_mode)
+        if not self.decoder_mode:
+            image_size = calculate_output_image_size(image_size, s)

         # Squeeze and Excitation layer, if desired
         if self.has_se:
@@ -186,6 +190,7 @@ def __init__(self, blocks_args=None, global_params=None):

         # Build blocks
         self._blocks = nn.ModuleList([])
+        self._blocks_image_size = [image_size]
         for block_args in self._blocks_args:

             # Update block input and output filters based on depth multiplier.
@@ -198,6 +203,7 @@ def __init__(self, blocks_args=None, global_params=None):
             # The first block needs to take care of stride and filter size increase.
             self._blocks.append(MBConvBlock(block_args, self._global_params, image_size=image_size))
             image_size = calculate_output_image_size(image_size, block_args.stride)
+            self._blocks_image_size.append(image_size)
             if block_args.num_repeat > 1: # modify block_args to keep same output size
                 block_args = block_args._replace(input_filters=block_args.output_filters, stride=1)
             for _ in range(block_args.num_repeat - 1):
@@ -219,6 +225,7 @@ def __init__(self, blocks_args=None, global_params=None):

         self._image_size = image_size
         self._last_block_args = block_args
+        self._last_out_channels = out_channels

     def set_swish(self, memory_efficient=True):
         """Sets swish function as memory efficient (for training) or standard (for export).
@@ -289,17 +296,18 @@ def extract_features(self, inputs): """ # Stem x = self._swish(self._bn0(self._conv_stem(inputs))) - + # print(f"after conv_stem: {x.size()}") # Blocks for idx, block in enumerate(self._blocks): drop_connect_rate = self._global_params.drop_connect_rate if drop_connect_rate: drop_connect_rate *= float(idx) / len(self._blocks) # scale drop connect_rate x = block(x, drop_connect_rate=drop_connect_rate) + # print(f"after block: {x.size()}") # Head x = self._swish(self._bn1(self._conv_head(x))) - + # print(f"after conv_head: {x.size()}") return x def forward(self, inputs): @@ -448,6 +456,13 @@ def __init__(self, blocks_args=None, global_params=None): bn_eps = self._bn_eps image_size = self._image_size block_args = self._last_block_args + + Conv2d = get_same_padding_conv2d(image_size=image_size) + self._feature_downsample = Conv2d(self._last_out_channels, 8, kernel_size=1, bias=False) + self._downsample_bn = nn.BatchNorm2d(num_features=8, momentum=bn_mom, eps=bn_eps) + self._feature_upsample = Conv2d(8, self._last_out_channels, kernel_size=1, bias=False) + self._upsample_bn = nn.BatchNorm2d(num_features=self._last_out_channels, momentum=bn_mom, eps=bn_eps) + self.feature_size = 8 * image_size[0]**2 # EfficientNet Decoder # use dynamic image size for decoder @@ -464,8 +479,12 @@ def __init__(self, blocks_args=None, global_params=None): # Build blocks self._decoder_blocks = nn.ModuleList([]) - for block_args in reversed(self._blocks_args): - + # print(f"foward size:\n{self._blocks_image_size}") + assert len(self._blocks_image_size) == len(self._blocks_args) + 1 + self._blocks_image_size = list(reversed(self._blocks_image_size)) + # print(f"backward size:\n{self._blocks_image_size}") + for i, block_args in enumerate(reversed(self._blocks_args)): + image_size = self._blocks_image_size[i] # Update block input and output filters based on depth multiplier. # input/output are flip here to support deconvolution block_args = block_args._replace( @@ -473,21 +492,42 @@ def __init__(self, blocks_args=None, global_params=None): output_filters=round_filters(block_args.input_filters, self._global_params), num_repeat=round_repeats(block_args.num_repeat, self._global_params) ) + # print(f"input filter: {block_args.input_filters}, output filter: {block_args.output_filters}") # The first block needs to take care of stride and filter size increase. 
- self._decoder_blocks.append(MBConvBlock(block_args, self._global_params, image_size=image_size, decoder_mode=True)) - image_size = calculate_output_image_size(image_size, block_args.stride, transposed=True) + self._decoder_blocks.append(MBConvBlock(block_args, self._global_params, image_size=image_size, + decoder_mode=True, decoder_output_image_size=self._blocks_image_size[i+1])) + image_size = self._blocks_image_size[i+1] if block_args.num_repeat > 1: # modify block_args to keep same output size block_args = block_args._replace(input_filters=block_args.output_filters, stride=1) for _ in range(block_args.num_repeat - 1): - self._decoder_blocks.append(MBConvBlock(block_args, self._global_params, image_size=image_size, decoder_mode=True)) + self._decoder_blocks.append(MBConvBlock(block_args, self._global_params, image_size=image_size, + decoder_mode=True, decoder_output_image_size=image_size)) # image_size = calculate_output_image_size(image_size, block_args.stride) # stride = 1 # Head in_channels = round_filters(32, self._global_params) # number of output channels out_channels = 3 # rgb + TransposedConv2d = get_same_padding_conv2d(image_size=global_params.image_size, transposed=True) self._decoder_conv_head = TransposedConv2d(in_channels, out_channels, kernel_size=3, stride=2, bias=False) self._decoder_bn1 = nn.BatchNorm2d(num_features=out_channels, momentum=bn_mom, eps=bn_eps) + def extract_features(self, inputs): + """use convolution layer to extract feature, + with additional down-sample layer to get 1280 hidden feature. + + Args: + inputs (tensor): Input tensor. + + Returns: + Output of the final convolution + layer in the efficientnet model. + """ + x = super().extract_features(inputs) + # print(f"before downsample size: {x.size()}") + x = self._swish(self._downsample_bn(self._feature_downsample(x))) + return x + + def decode_features(self, inputs): """decoder portion of this autoencoder. @@ -499,8 +539,12 @@ def decode_features(self, inputs): Output of the final convolution layer in the efficientnet model. """ + # upsample + x = self._swish(self._upsample_bn(self._feature_upsample(inputs))) + # print(f"after upsample size: {x.size()}") # Stem - x = self._swish(self._decoder_bn0(self._decoder_conv_stem(inputs))) + x = self._swish(self._decoder_bn0(self._decoder_conv_stem(x))) + # print(f"after decoder_conv_stem: {x.size()}") # Blocks for idx, block in enumerate(self._decoder_blocks): drop_connect_rate = self._global_params.drop_connect_rate @@ -508,17 +552,18 @@ def decode_features(self, inputs): # scale drop connect_rate drop_connect_rate *= float(idx) / len(self._blocks) x = block(x, drop_connect_rate=drop_connect_rate) + # print(f"after block: {x.size()}") # Head x = self._swish(self._decoder_bn1(self._decoder_conv_head(x))) - + # print(f"after decoder_conv_head: {x.size()}") return x def forward(self, inputs): """EfficientNet AutoEncoder's forward function. Calls extract_features to extract features, - then calls decode features to calculate AE output. + then calls decode features to generates original inputs. Args: inputs (tensor): Input tensor. 
@@ -527,14 +572,14 @@ def forward(self, inputs): (AE output tensor, latent representation tensor) """ # Convolution layers + # print(f"input size: {inputs.size()}") x = self.extract_features(inputs) # Pooling and final linear layer - latent_rep = self._avg_pooling(x) - latent_rep = latent_rep.flatten(start_dim=1) - latent_rep = self._dropout(latent_rep) - latent_rep = self._fc(latent_rep) - + latent_rep = x.flatten(start_dim=1) + # print(latent_rep.size()) + # Deconvolution - decoder x = self.decode_features(x) - return x, latent_rep + # print(f"final output size: {x.size()}") + return x, latent_rep \ No newline at end of file diff --git a/efficientnet_pytorch/utils.py b/efficientnet_pytorch/utils.py index 2a95c9a..60d461c 100644 --- a/efficientnet_pytorch/utils.py +++ b/efficientnet_pytorch/utils.py @@ -208,7 +208,10 @@ def get_same_padding_conv2d(image_size=None, transposed=False): Conv2dDynamicSamePadding or Conv2dStaticSamePadding. """ if transposed: - return TransposedConv2dDynamicSamePadding + if image_size is None: + raise NotImplementedError('Unable to dynamically upsample to odd image size.') + else: + return partial(TransposedConv2dStaticSamePadding, image_size=image_size) if image_size is None: return Conv2dDynamicSamePadding @@ -279,7 +282,7 @@ def forward(self, x): x = F.conv2d(x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups) return x -class TransposedConv2dDynamicSamePadding(nn.ConvTranspose2d): +class TransposedConv2dStaticSamePadding(nn.ConvTranspose2d): """2D Convolutions like TensorFlow, for a dynamic image size. The padding is operated in forward function by calculating dynamically. """ @@ -295,32 +298,41 @@ class TransposedConv2dDynamicSamePadding(nn.ConvTranspose2d): # Output after ConvTranspose2d: # (i-1)*s + (k-1)*d + op + 1 - def __init__(self, in_channels, out_channels, kernel_size, stride=1, output_padding=0, groups=1, bias=True, dilation=1): + def __init__(self, in_channels, out_channels, kernel_size, image_size, stride=1, output_padding=0, groups=1, bias=True, dilation=1): super().__init__(in_channels, out_channels, kernel_size, stride, 0, output_padding, groups, bias, dilation) self.stride = self.stride if len(self.stride) == 2 else [self.stride[0]] * 2 self.output_padding = output_padding - - def forward(self, x): - ih, iw = x.size()[-2:] - kh, kw = self.weight.size()[-2:] + # NOTE: image_size here represents the desired output image_size + oh, ow = (image_size, image_size) if isinstance(image_size, int) else image_size + self._oh, self._ow = oh, ow sh, sw = self.stride - oh, ow = ih * sh, iw * sw # change the output size according to stride ! ! ! 
+ ih, iw = math.ceil(oh / sh), math.ceil(ow / sw) # using same calculation in Conv2dStaticSamePadding + self._ih, self._iw = ih, iw + kh, kw = self.weight.size()[-2:] # actual height/width after TransposedConv2d actual_oh = (ih - 1) * self.stride[0] + (kh - 1) * self.dilation[0] + self.output_padding + 1 actual_ow = (iw - 1) * self.stride[1] + (kw - 1) * self.dilation[1] + self.output_padding + 1 crop_h = actual_oh - oh crop_w = actual_ow - ow assert crop_h >= 0 and crop_w >= 0 - + self._crop_h = crop_h + self._crop_w = crop_w + self._actual_oh = actual_oh + self._actual_ow = actual_ow + + def forward(self, x): + # print(f" - Transposed2dStaticPadding input:{x.size()} expected:{self._ih, self._iw}") + # assert x.size()[-2:] == (self._ih, self._iw) x = F.conv_transpose2d(x, self.weight, self.bias, self.stride, self.padding, self.output_padding, self.groups, self.dilation) - assert x.size()[-2:] == (actual_oh, actual_ow) + # assert x.size()[-2:] == (self._actual_oh, self._actual_ow) + crop_h, crop_w = self._crop_h, self._crop_w if crop_h > 0 or crop_w > 0: x = x[:, :, crop_h // 2 : - (crop_h - crop_h // 2), crop_w // 2 : - (crop_w - crop_w // 2)] - - assert x.size()[-2:] == (oh, ow) + # assert x.size()[-2:] == (self._oh, self._ow) return x + def get_same_padding_maxPool2d(image_size=None): """Chooses static padding if you have specified an image size, and dynamic padding otherwise. Static padding is necessary for ONNX exporting of models. @@ -652,7 +664,7 @@ def load_pretrained_weights(model, model_name, weights_path=None, load_fc=True, # TODO: add initialization to missing layers missing_keys = [] for key in ret.missing_keys: - if not key.startswith('_decoder'): + if not key.startswith(('_decoder', '_feature', '_upsample', '_downsample')): missing_keys.append(key) assert not missing_keys, 'Missing keys when loading pretrained weights: {}'.format( From 4095e2ce42264570a57fb39e7925c21ac66bacc2 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Mon, 28 Dec 2020 23:43:13 +0800 Subject: [PATCH 10/10] remove debug print for clarity --- efficientnet_pytorch/model.py | 14 -------------- efficientnet_pytorch/utils.py | 1 - 2 files changed, 15 deletions(-) diff --git a/efficientnet_pytorch/model.py b/efficientnet_pytorch/model.py index 2dfe476..a3450b3 100644 --- a/efficientnet_pytorch/model.py +++ b/efficientnet_pytorch/model.py @@ -296,18 +296,15 @@ def extract_features(self, inputs): """ # Stem x = self._swish(self._bn0(self._conv_stem(inputs))) - # print(f"after conv_stem: {x.size()}") # Blocks for idx, block in enumerate(self._blocks): drop_connect_rate = self._global_params.drop_connect_rate if drop_connect_rate: drop_connect_rate *= float(idx) / len(self._blocks) # scale drop connect_rate x = block(x, drop_connect_rate=drop_connect_rate) - # print(f"after block: {x.size()}") # Head x = self._swish(self._bn1(self._conv_head(x))) - # print(f"after conv_head: {x.size()}") return x def forward(self, inputs): @@ -479,10 +476,8 @@ def __init__(self, blocks_args=None, global_params=None): # Build blocks self._decoder_blocks = nn.ModuleList([]) - # print(f"foward size:\n{self._blocks_image_size}") assert len(self._blocks_image_size) == len(self._blocks_args) + 1 self._blocks_image_size = list(reversed(self._blocks_image_size)) - # print(f"backward size:\n{self._blocks_image_size}") for i, block_args in enumerate(reversed(self._blocks_args)): image_size = self._blocks_image_size[i] # Update block input and output filters based on depth multiplier. 
@@ -492,7 +487,6 @@ def __init__(self, blocks_args=None, global_params=None): output_filters=round_filters(block_args.input_filters, self._global_params), num_repeat=round_repeats(block_args.num_repeat, self._global_params) ) - # print(f"input filter: {block_args.input_filters}, output filter: {block_args.output_filters}") # The first block needs to take care of stride and filter size increase. self._decoder_blocks.append(MBConvBlock(block_args, self._global_params, image_size=image_size, decoder_mode=True, decoder_output_image_size=self._blocks_image_size[i+1])) @@ -523,7 +517,6 @@ def extract_features(self, inputs): layer in the efficientnet model. """ x = super().extract_features(inputs) - # print(f"before downsample size: {x.size()}") x = self._swish(self._downsample_bn(self._feature_downsample(x))) return x @@ -541,10 +534,8 @@ def decode_features(self, inputs): """ # upsample x = self._swish(self._upsample_bn(self._feature_upsample(inputs))) - # print(f"after upsample size: {x.size()}") # Stem x = self._swish(self._decoder_bn0(self._decoder_conv_stem(x))) - # print(f"after decoder_conv_stem: {x.size()}") # Blocks for idx, block in enumerate(self._decoder_blocks): drop_connect_rate = self._global_params.drop_connect_rate @@ -552,11 +543,9 @@ def decode_features(self, inputs): # scale drop connect_rate drop_connect_rate *= float(idx) / len(self._blocks) x = block(x, drop_connect_rate=drop_connect_rate) - # print(f"after block: {x.size()}") # Head x = self._swish(self._decoder_bn1(self._decoder_conv_head(x))) - # print(f"after decoder_conv_head: {x.size()}") return x @@ -572,14 +561,11 @@ def forward(self, inputs): (AE output tensor, latent representation tensor) """ # Convolution layers - # print(f"input size: {inputs.size()}") x = self.extract_features(inputs) # Pooling and final linear layer latent_rep = x.flatten(start_dim=1) - # print(latent_rep.size()) # Deconvolution - decoder x = self.decode_features(x) - # print(f"final output size: {x.size()}") return x, latent_rep \ No newline at end of file diff --git a/efficientnet_pytorch/utils.py b/efficientnet_pytorch/utils.py index 60d461c..42bc656 100644 --- a/efficientnet_pytorch/utils.py +++ b/efficientnet_pytorch/utils.py @@ -321,7 +321,6 @@ def __init__(self, in_channels, out_channels, kernel_size, image_size, stride=1, self._actual_ow = actual_ow def forward(self, x): - # print(f" - Transposed2dStaticPadding input:{x.size()} expected:{self._ih, self._iw}") # assert x.size()[-2:] == (self._ih, self._iw) x = F.conv_transpose2d(x, self.weight, self.bias, self.stride, self.padding, self.output_padding, self.groups, self.dilation)
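With the full series applied, the model is usable end to end. A usage sketch (assumptions: all ten patches applied on top of efficientnet_pytorch; shapes shown are for efficientnet-b0 at 224x224, where the 8-channel bottleneck over the final 7x7 map flattens to a 392-dimensional latent, matching self.feature_size):

import torch
from efficientnet_pytorch.model import EfficientNetAutoEncoder

# Encoder weights come from the pretrained release; the decoder and the
# down/upsample bottleneck stay randomly initialized, since the patched
# load_pretrained_weights skips keys starting with
# _decoder/_feature/_upsample/_downsample.
model = EfficientNetAutoEncoder.from_pretrained('efficientnet-b0')
model.eval()

inputs = torch.rand(1, 3, 224, 224)
with torch.no_grad():
    reconstruction, latent = model(inputs)

print(reconstruction.shape)  # torch.Size([1, 3, 224, 224])
print(latent.shape)          # torch.Size([1, 392]) == 8 * 7 * 7
print(model.feature_size)    # 392

A plain reconstruction objective such as F.mse_loss(reconstruction, inputs) is the natural training signal here, with the pretrained encoder frozen or fine-tuned as needed.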