
Commit a3cc9ac (parent: a48886c)

Add timm efficientnet encoder (qubvel-org#189)

* Add efficientnet from timm

8 files changed: +489 -22 lines

README.md (+10)

@@ -120,6 +120,16 @@ preprocess_input = get_preprocessing_fn('resnet18', pretrained='imagenet')
 |efficientnet-b7 |imagenet |63M |
 |mobilenet_v2 |imagenet |2M |
 |xception |imagenet |22M |
+|timm-efficientnet-b0 |imagenet<br>advprop<br>noisy-student|4M |
+|timm-efficientnet-b1 |imagenet<br>advprop<br>noisy-student|6M |
+|timm-efficientnet-b2 |imagenet<br>advprop<br>noisy-student|7M |
+|timm-efficientnet-b3 |imagenet<br>advprop<br>noisy-student|10M |
+|timm-efficientnet-b4 |imagenet<br>advprop<br>noisy-student|17M |
+|timm-efficientnet-b5 |imagenet<br>advprop<br>noisy-student|28M |
+|timm-efficientnet-b6 |imagenet<br>advprop<br>noisy-student|40M |
+|timm-efficientnet-b7 |imagenet<br>advprop<br>noisy-student|63M |
+|timm-efficientnet-b8 |imagenet<br>advprop |84M |
+|timm-efficientnet-l2 |noisy-student |474M |
 
 ### Models API <a name="api"></a>
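
The new encoders plug into the existing Models API like any other backbone. A minimal sketch (Unet is just one example architecture; per the table above, "advprop" and "noisy-student" join "imagenet" as weight options for most of the timm models):

import segmentation_models_pytorch as smp

# pass encoder_weights=None instead to skip the pretrained-weight download
model = smp.Unet(
    encoder_name="timm-efficientnet-b0",
    encoder_weights="noisy-student",
    classes=2,
)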

requirements.txt (+1)

@@ -1,3 +1,4 @@
 torchvision>=0.3.0
 pretrainedmodels==0.7.4
 efficientnet-pytorch>=0.6.3
+timm==0.1.20

segmentation_models_pytorch/__init__.py (+1, -1)

@@ -2,7 +2,7 @@
 from .linknet import Linknet
 from .fpn import FPN
 from .pspnet import PSPNet
-from .deeplabv3 import DeepLabV3
+from .deeplabv3 import DeepLabV3, DeepLabV3Plus
 from .pan import PAN
 
 from . import encoders

segmentation_models_pytorch/deeplabv3/__init__.py (+1, -1)

@@ -1 +1 @@
-from .model import DeepLabV3
+from .model import DeepLabV3, DeepLabV3Plus

segmentation_models_pytorch/deeplabv3/decoder.py (+131, -18)

@@ -51,23 +51,99 @@ def forward(self, *features):
         return super().forward(features[-1])
 
 
+class DeepLabV3PlusDecoder(nn.Module):
+    def __init__(
+        self,
+        encoder_channels,
+        out_channels=256,
+        atrous_rates=(12, 24, 36),
+        output_stride=16,
+    ):
+        super().__init__()
+        if output_stride not in {8, 16}:
+            raise ValueError("Output stride should be 8 or 16, got {}.".format(output_stride))
+
+        self.out_channels = out_channels
+        self.output_stride = output_stride
+
+        self.aspp = nn.Sequential(
+            ASPP(encoder_channels[-1], out_channels, atrous_rates, separable=True),
+            SeparableConv2d(out_channels, out_channels, kernel_size=3, padding=1, bias=False),
+            nn.BatchNorm2d(out_channels),
+            nn.ReLU(),
+        )
+
+        scale_factor = 2 if output_stride == 8 else 4
+        self.up = nn.UpsamplingBilinear2d(scale_factor=scale_factor)
+
+        highres_in_channels = encoder_channels[-4]
+        highres_out_channels = 48  # proposed by authors of paper
+        self.block1 = nn.Sequential(
+            nn.Conv2d(highres_in_channels, highres_out_channels, kernel_size=1, bias=False),
+            nn.BatchNorm2d(highres_out_channels),
+            nn.ReLU(),
+        )
+        self.block2 = nn.Sequential(
+            SeparableConv2d(
+                highres_out_channels + out_channels,
+                out_channels,
+                kernel_size=3,
+                padding=1,
+                bias=False,
+            ),
+            nn.BatchNorm2d(out_channels),
+            nn.ReLU(),
+        )
+
+    def forward(self, *features):
+        aspp_features = self.aspp(features[-1])
+        aspp_features = self.up(aspp_features)
+        high_res_features = self.block1(features[-4])
+        concat_features = torch.cat([aspp_features, high_res_features], dim=1)
+        fused_features = self.block2(concat_features)
+        return fused_features
+
+
 class ASPPConv(nn.Sequential):
     def __init__(self, in_channels, out_channels, dilation):
-        modules = [
-            nn.Conv2d(in_channels, out_channels, 3, padding=dilation, dilation=dilation, bias=False),
+        super().__init__(
+            nn.Conv2d(
+                in_channels,
+                out_channels,
+                kernel_size=3,
+                padding=dilation,
+                dilation=dilation,
+                bias=False,
+            ),
+            nn.BatchNorm2d(out_channels),
+            nn.ReLU(),
+        )
+
+
+class ASPPSeparableConv(nn.Sequential):
+    def __init__(self, in_channels, out_channels, dilation):
+        super().__init__(
+            SeparableConv2d(
+                in_channels,
+                out_channels,
+                kernel_size=3,
+                padding=dilation,
+                dilation=dilation,
+                bias=False,
+            ),
             nn.BatchNorm2d(out_channels),
-            nn.ReLU()
-        ]
-        super(ASPPConv, self).__init__(*modules)
+            nn.ReLU(),
+        )
 
 
 class ASPPPooling(nn.Sequential):
     def __init__(self, in_channels, out_channels):
-        super(ASPPPooling, self).__init__(
+        super().__init__(
             nn.AdaptiveAvgPool2d(1),
-            nn.Conv2d(in_channels, out_channels, 1, bias=False),
+            nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False),
             nn.BatchNorm2d(out_channels),
-            nn.ReLU())
+            nn.ReLU(),
+        )
 
     def forward(self, x):
         size = x.shape[-2:]
@@ -77,31 +153,68 @@ def forward(self, x):
 
 
 class ASPP(nn.Module):
-    def __init__(self, in_channels, out_channels, atrous_rates):
+    def __init__(self, in_channels, out_channels, atrous_rates, separable=False):
         super(ASPP, self).__init__()
         modules = []
-        modules.append(nn.Sequential(
-            nn.Conv2d(in_channels, out_channels, 1, bias=False),
-            nn.BatchNorm2d(out_channels),
-            nn.ReLU()))
+        modules.append(
+            nn.Sequential(
+                nn.Conv2d(in_channels, out_channels, 1, bias=False),
+                nn.BatchNorm2d(out_channels),
+                nn.ReLU(),
+            )
+        )
 
         rate1, rate2, rate3 = tuple(atrous_rates)
-        modules.append(ASPPConv(in_channels, out_channels, rate1))
-        modules.append(ASPPConv(in_channels, out_channels, rate2))
-        modules.append(ASPPConv(in_channels, out_channels, rate3))
+        ASPPConvModule = ASPPConv if not separable else ASPPSeparableConv
+
+        modules.append(ASPPConvModule(in_channels, out_channels, rate1))
+        modules.append(ASPPConvModule(in_channels, out_channels, rate2))
+        modules.append(ASPPConvModule(in_channels, out_channels, rate3))
         modules.append(ASPPPooling(in_channels, out_channels))
 
         self.convs = nn.ModuleList(modules)
 
         self.project = nn.Sequential(
-            nn.Conv2d(5 * out_channels, out_channels, 1, bias=False),
+            nn.Conv2d(5 * out_channels, out_channels, kernel_size=1, bias=False),
             nn.BatchNorm2d(out_channels),
             nn.ReLU(),
-            nn.Dropout(0.5))
+            nn.Dropout(0.5),
+        )
 
     def forward(self, x):
         res = []
         for conv in self.convs:
             res.append(conv(x))
         res = torch.cat(res, dim=1)
         return self.project(res)
+
+
+class SeparableConv2d(nn.Sequential):
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        dilation=1,
+        bias=True,
+    ):
+        depthwise_conv = nn.Conv2d(
+            in_channels,
+            in_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=in_channels,
+            bias=False,
+        )
+        pointwise_conv = nn.Conv2d(
+            in_channels,
+            out_channels,
+            kernel_size=1,
+            bias=bias,
+        )
+        super().__init__(depthwise_conv, pointwise_conv)
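
The new SeparableConv2d is what keeps the DeepLabV3+ decoder light: a k x k convolution is factorized into a per-channel (depthwise) k x k followed by a 1x1 (pointwise) channel mixer. A self-contained sketch of the parameter savings at the decoder's default width of 256 channels (plain torch.nn, independent of this package):

import torch.nn as nn

in_ch, out_ch, k = 256, 256, 3

# a regular 3x3 convolution
regular = nn.Conv2d(in_ch, out_ch, k, padding=1, bias=False)

# the same receptive field, factorized as in SeparableConv2d above
separable = nn.Sequential(
    nn.Conv2d(in_ch, in_ch, k, padding=1, groups=in_ch, bias=False),  # depthwise
    nn.Conv2d(in_ch, out_ch, kernel_size=1, bias=False),              # pointwise
)

def n_params(m):
    return sum(p.numel() for p in m.parameters())

print(n_params(regular))    # 589824 = 256 * 256 * 3 * 3
print(n_params(separable))  # 67840  = 256 * 3 * 3 + 256 * 256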

segmentation_models_pytorch/deeplabv3/model.py (+94, -1)

@@ -1,7 +1,7 @@
 import torch.nn as nn
 
 from typing import Optional
-from .decoder import DeepLabV3Decoder
+from .decoder import DeepLabV3Decoder, DeepLabV3PlusDecoder
 from ..base import SegmentationModel, SegmentationHead, ClassificationHead
 from ..encoders import get_encoder
 
@@ -79,3 +79,96 @@ def __init__(
             )
         else:
             self.classification_head = None
+
+
+class DeepLabV3Plus(SegmentationModel):
+    """DeepLabV3Plus_ implementation from "Encoder-Decoder with Atrous Separable
+    Convolution for Semantic Image Segmentation"
+    Args:
+        encoder_name: name of classification model (without last dense layers) used as feature
+            extractor to build segmentation model.
+        encoder_depth: number of stages used in decoder, larger depth - more features are generated.
+            e.g. for depth=3 encoder will generate list of features with following spatial shapes
+            [(H, W), (H/2, W/2), (H/4, W/4), (H/8, W/8)], so in general the deepest feature will have
+            spatial resolution (H/(2^depth), W/(2^depth))
+        encoder_weights: one of ``None`` (random initialization), ``imagenet`` (pre-training on ImageNet).
+        encoder_output_stride: downsampling factor for deepest encoder features (see original paper for explanation)
+        decoder_atrous_rates: dilation rates for ASPP module (should be a tuple of 3 integer values)
+        decoder_channels: a number of convolution filters in ASPP module (default 256).
+        in_channels: number of input channels for model, default is 3.
+        classes: a number of classes for output (output shape - ``(batch, classes, h, w)``).
+        activation (str, callable): activation function used in ``.predict(x)`` method for inference.
+            One of [``sigmoid``, ``softmax2d``, callable, None]
+        upsampling: optional, final upsampling factor
+            (default is 4 to preserve input -> output spatial shape identity)
+        aux_params: if specified model will have additional classification auxiliary output
+            built on top of encoder, supported params:
+                - classes (int): number of classes
+                - pooling (str): one of 'max', 'avg'. Default is 'avg'.
+                - dropout (float): dropout factor in [0, 1)
+                - activation (str): activation function to apply "sigmoid"/"softmax" (could be None to return logits)
+    Returns:
+        ``torch.nn.Module``: **DeepLabV3Plus**
+    .. _DeepLabV3Plus:
+        https://arxiv.org/abs/1802.02611v3
+    """
+    def __init__(
+        self,
+        encoder_name: str = "resnet34",
+        encoder_depth: int = 5,
+        encoder_weights: Optional[str] = "imagenet",
+        encoder_output_stride: int = 16,
+        decoder_channels: int = 256,
+        decoder_atrous_rates: tuple = (12, 24, 36),
+        in_channels: int = 3,
+        classes: int = 1,
+        activation: Optional[str] = None,
+        upsampling: int = 4,
+        aux_params: Optional[dict] = None,
+    ):
+        super().__init__()
+
+        self.encoder = get_encoder(
+            encoder_name,
+            in_channels=in_channels,
+            depth=encoder_depth,
+            weights=encoder_weights,
+        )
+
+        if encoder_output_stride == 8:
+            self.encoder.make_dilated(
+                stage_list=[4, 5],
+                dilation_list=[2, 4]
+            )
+
+        elif encoder_output_stride == 16:
+            self.encoder.make_dilated(
+                stage_list=[5],
+                dilation_list=[2]
+            )
+        else:
+            raise ValueError(
+                "Encoder output stride should be 8 or 16, got {}".format(encoder_output_stride)
+            )
+
+        self.decoder = DeepLabV3PlusDecoder(
+            encoder_channels=self.encoder.out_channels,
+            out_channels=decoder_channels,
+            atrous_rates=decoder_atrous_rates,
+            output_stride=encoder_output_stride,
+        )
+
+        self.segmentation_head = SegmentationHead(
+            in_channels=self.decoder.out_channels,
+            out_channels=classes,
+            activation=activation,
+            kernel_size=1,
+            upsampling=upsampling,
+        )
+
+        if aux_params is not None:
+            self.classification_head = ClassificationHead(
+                in_channels=self.encoder.out_channels[-1], **aux_params
+            )
+        else:
+            self.classification_head = None
segmentation_models_pytorch/encoders/__init__.py (+2, -1)

@@ -11,7 +11,7 @@
 from .efficientnet import efficient_net_encoders
 from .mobilenet import mobilenet_encoders
 from .xception import xception_encoders
-
+from .timm_efficientnet import timm_efficientnet_encoders
 
 from ._preprocessing import preprocess_input
 
@@ -26,6 +26,7 @@
 encoders.update(efficient_net_encoders)
 encoders.update(mobilenet_encoders)
 encoders.update(xception_encoders)
+encoders.update(timm_efficientnet_encoders)
 
 
 def get_encoder(name, in_channels=3, depth=5, weights=None):
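
Registering the encoders in this dict is what makes the new names resolvable by every model class; the imported timm_efficientnet module is the new file this commit adds. A sketch of direct construction (weight names follow the README table, and availability depends on the pinned timm version):

from segmentation_models_pytorch.encoders import get_encoder

encoder = get_encoder("timm-efficientnet-b0", in_channels=3, depth=5, weights=None)
print(encoder.out_channels)  # per-stage channel counts consumed by the decoders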
