Skip to content

Commit d1b4fac

Browse files
authored
add UNet3P (PaddlePaddle#906)
1 parent 0d683da commit d1b4fac

File tree

7 files changed

+327
-0
lines changed

7 files changed

+327
-0
lines changed

README.md

+1
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ Welcome to PaddleSeg! PaddleSeg is an end-to-end image segmentation development
4242
|[U<sup>2</sup>-Net](./configs/u2net)|-|-|-|-|
4343
|[Att U-Net](./configs/attention_unet)|-|-|-|-|
4444
|[U-Net++](./configs/unet_plusplus)|-|-|-|-|
45+
|[U-Net3+](./configs/unet_3plus)|-|-|-|-|
4546
|[DecoupledSegNet](./configs/decoupled_segnet)|||||
4647
|[EMANet](./configs/emanet)|||-|-|
4748
|[ISANet](./configs/isanet)|||-|-|

README_CN.md

+1
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ PaddleSeg是基于飞桨[PaddlePaddle](https://www.paddlepaddle.org.cn)开发的
4242
|[U<sup>2</sup>-Net](./configs/u2net)|-|-|-|-|
4343
|[Att U-Net](./configs/attention_unet)|-|-|-|-|
4444
|[U-Net++](./configs/unet_plusplus)|-|-|-|-|
45+
|[U-Net3+](./configs/unet_3plus)|-|-|-|-|
4546
|[DecoupledSegNet](./configs/decoupled_segnet)|||||
4647
|[EMANet](./configs/emanet)|||-|-|
4748
|[ISANet](./configs/isanet)|||-|-|

configs/unet_3plus/README.md

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# UNet 3+: A Full-Scale Connected UNet for Medical Image Segmentation
2+
3+
## Reference
4+
5+
> Huang H , Lin L , Tong R , et al. UNet 3+: A Full-Scale Connected UNet for Medical Image Segmentation[J]. ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), 2020.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
_base_: '../_base_/cityscapes.yml'
2+
3+
batch_size: 4
4+
iters: 160000
5+
6+
model:
7+
type: UNet3Plus
8+
in_channels: 3
9+
num_classes: 19
10+
is_batchnorm: True
11+
is_deepsup: False
12+
is_CGM: False

docs/apis/models.md

+22
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ The models subpackage contains the following model for image semantic segmentation
1818
- [U<sup>2</sup>Net+](#U2Net-1)
1919
- [AttentionUNet](#AttentionUNet)
2020
- [UNet++](#UNet-1)
21+
- [UNet3+](#UNet-2)
2122
- [DecoupledSegNet](#DecoupledSegNet)
2223
- [ISANet](#ISANet)
2324
- [EMANet](#EMANet)
@@ -408,6 +409,27 @@ The models subpackage contains the following model for image semantic segmentation
408409
> > > - **pretrained** (str, optional): The path or url of pretrained model for fine tuning. Default: None.
409410
> > > - **is_ds** (bool): use deep supervision or not. Default: True
410411
412+
## <span id="UNet-2">[UNet3+](../../paddleseg/models/unet_3plus.py)</span>
413+
> class UNet3Plus(in_channels,
414+
num_classes,
415+
is_batchnorm=True,
416+
is_deepsup=False,
417+
is_CGM=False)
418+
419+
The UNet3+ implementation based on PaddlePaddle.
420+
421+
The original article refers to
422+
Huang H , Lin L , Tong R , et al. "UNet 3+: A Full-Scale Connected UNet for Medical Image Segmentation"
423+
(https://arxiv.org/abs/2004.08790).
424+
425+
> > Args
426+
> > > - **in_channels** (int): The channel number of input image.
427+
> > > - **num_classes** (int): The unique number of target classes.
428+
> > > - **is_batchnorm** (bool, optional): Use batchnorm after conv or not. Default: True.
429+
> > > - **is_deepsup** (bool, optional): Use deep supervision or not. Default: False.
430+
> > > - **is_CGM** (bool, optional): Use classification-guided module or not.
431+
If True, is_deepsup must be True. Default: False.
432+
411433
## [DecoupledSegNet](../../paddleseg/models/decoupled_segnet.py)
412434
> class DecoupledSegNet(num_classes,
413435
backbone,

paddleseg/models/__init__.py

+1
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
from .u2net import U2Net, U2Netp
3131
from .attention_unet import AttentionUNet
3232
from .unet_plusplus import UNetPlusPlus
33+
from .unet_3plus import UNet3Plus
3334
from .decoupled_segnet import DecoupledSegNet
3435
from .emanet import *
3536
from .isanet import *

paddleseg/models/unet_3plus.py

+285
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,285 @@
1+
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import paddle
16+
import paddle.nn as nn
17+
import paddle.nn.functional as F
18+
19+
from paddleseg.cvlibs import manager
20+
from paddleseg.models.layers.layer_libs import SyncBatchNorm
21+
from paddleseg.cvlibs.param_init import kaiming_normal_init
22+
23+
24+
@manager.MODELS.add_component
class UNet3Plus(nn.Layer):
    """
    The UNet3+ implementation based on PaddlePaddle.

    The original article refers to
    Huang H , Lin L , Tong R , et al. "UNet 3+: A Full-Scale Connected UNet for Medical Image Segmentation"
    (https://arxiv.org/abs/2004.08790).

    Args:
        in_channels (int, optional): The channel number of input image. Default: 3.
        num_classes (int, optional): The unique number of target classes. Default: 2.
        is_batchnorm (bool, optional): Use batchnorm after conv or not. Default: True.
        is_deepsup (bool, optional): Use deep supervision or not. Default: False.
        is_CGM (bool, optional): Use classification-guided module or not.
            If True, is_deepsup must be True. Default: False.
    """

    def __init__(self,
                 in_channels=3,
                 num_classes=2,
                 is_batchnorm=True,
                 is_deepsup=False,
                 is_CGM=False):
        super(UNet3Plus, self).__init__()
        # The classification-guided module needs the deep-supervision
        # branches, so CGM forces deep supervision on.
        self.is_deepsup = True if is_CGM else is_deepsup
        self.is_CGM = is_CGM
        # Channel widths of the five encoder stages.
        self.filters = [64, 128, 256, 512, 1024]
        self.cat_channels = self.filters[0]
        self.cat_blocks = 5
        # Every decoder stage concatenates 5 maps of cat_channels each.
        self.up_channels = self.cat_channels * self.cat_blocks
        # Sub-networks.
        self.encoder = Encoder(in_channels, self.filters, is_batchnorm)
        self.decoder = Decoder(self.filters, self.cat_channels,
                               self.up_channels)
        if self.is_deepsup:
            self.deepsup = DeepSup(self.up_channels, self.filters, num_classes)
            if self.is_CGM:
                # 2-way (organ / background) classifier on the deepest features.
                self.cls = nn.Sequential(
                    nn.Dropout(p=0.5), nn.Conv2D(self.filters[4], 2, 1),
                    nn.AdaptiveMaxPool2D(1), nn.Sigmoid())
        else:
            self.outconv1 = nn.Conv2D(
                self.up_channels, num_classes, 3, padding=1)
        # Initialise weights.
        # NOTE(review): BatchNorm weights also receive Kaiming init here,
        # same as the convs — the reference implementation initialises BN
        # differently; confirm this is intentional.
        for sublayer in self.sublayers():
            if isinstance(sublayer,
                          (nn.Conv2D, nn.BatchNorm, nn.SyncBatchNorm)):
                kaiming_normal_init(sublayer.weight)

    def dotProduct(self, seg, cls):
        """Scale each class map in `seg` by its classification score `cls`."""
        batch, channels, height, width = seg.shape
        flat = seg.reshape((batch, channels, height * width))
        # Broadcast the (B, 1) score against a (1, N) row of ones to get a
        # per-class scale of shape (B, N, 1).
        scale = (cls * paddle.ones([1, channels])).reshape(
            [batch, channels, 1])
        return (flat * scale).reshape((batch, channels, height, width))

    def forward(self, inputs):
        feats = self.encoder(inputs)
        decoded = self.decoder(feats)
        if not self.is_deepsup:
            # Single head on the finest decoder map: d1->320*320*num_classes.
            return [self.outconv1(decoded[0])]
        out = self.deepsup(decoded)
        if self.is_CGM:
            # Classification-guided module: (B,N,1,1)->(B,N).
            cls_branch = self.cls(decoded[-1]).squeeze(3).squeeze(2)
            cls_branch_max = cls_branch.argmax(axis=1)
            cls_branch_max = cls_branch_max.reshape((-1, 1)).astype('float')
            out = [self.dotProduct(d, cls_branch_max) for d in out]
        return out
93+
94+
95+
class Encoder(nn.Layer):
    """UNet3+ encoder: one plain conv stage followed by four pool+conv stages.

    Each pooling stage halves the spatial size and doubles the channels:
    h1->320*320*64, h2->160*160*128, h3->80*80*256, h4->40*40*512,
    hd5->20*20*1024. Returns the five stage outputs as a list.
    """

    def __init__(self, in_channels, filters, is_batchnorm):
        super(Encoder, self).__init__()
        self.conv1 = UnetConv2D(in_channels, filters[0], is_batchnorm)
        self.poolconv2 = MaxPoolConv2D(filters[0], filters[1], is_batchnorm)
        self.poolconv3 = MaxPoolConv2D(filters[1], filters[2], is_batchnorm)
        self.poolconv4 = MaxPoolConv2D(filters[2], filters[3], is_batchnorm)
        self.poolconv5 = MaxPoolConv2D(filters[3], filters[4], is_batchnorm)

    def forward(self, inputs):
        feats = [self.conv1(inputs)]
        # Run the remaining stages sequentially, each on the previous output.
        for stage in (self.poolconv2, self.poolconv3, self.poolconv4,
                      self.poolconv5):
            feats.append(stage(feats[-1]))
        return feats
111+
112+
113+
class Decoder(nn.Layer):
    """UNet3+ full-scale decoder.

    Every decoder stage hdN fuses five same-resolution feature maps: finer
    encoder maps max-pooled down (PT), the same-scale encoder map passed
    through (Cat), and coarser decoder/encoder maps bilinearly upsampled
    (UT). Each source is reduced to `cat_channels` channels, concatenated
    (5 * cat_channels == up_channels), and fused by one conv-bn-relu block.
    Returns [hd1, hd2, hd3, hd4, hd5] — finest to coarsest — where hd5 is
    the untouched deepest encoder map.
    """
    def __init__(self, filters, cat_channels, up_channels):
        super(Decoder, self).__init__()
        '''stage 4d'''
        # h1->320*320, hd4->40*40, Pooling 8 times
        self.h1_PT_hd4 = nn.MaxPool2D(8, 8, ceil_mode=True)
        self.h1_PT_hd4_cbr = ConvBnReLU2D(filters[0], cat_channels)
        # h2->160*160, hd4->40*40, Pooling 4 times
        self.h2_PT_hd4 = nn.MaxPool2D(4, 4, ceil_mode=True)
        self.h2_PT_hd4_cbr = ConvBnReLU2D(filters[1], cat_channels)
        # h3->80*80, hd4->40*40, Pooling 2 times
        self.h3_PT_hd4 = nn.MaxPool2D(2, 2, ceil_mode=True)
        self.h3_PT_hd4_cbr = ConvBnReLU2D(filters[2], cat_channels)
        # h4->40*40, hd4->40*40, Concatenation
        self.h4_Cat_hd4_cbr = ConvBnReLU2D(filters[3], cat_channels)
        # hd5->20*20, hd4->40*40, Upsample 2 times
        self.hd5_UT_hd4 = nn.Upsample(scale_factor=2, mode='bilinear')
        self.hd5_UT_hd4_cbr = ConvBnReLU2D(filters[4], cat_channels)
        # fusion(h1_PT_hd4, h2_PT_hd4, h3_PT_hd4, h4_Cat_hd4, hd5_UT_hd4)
        self.cbr4d_1 = ConvBnReLU2D(up_channels, up_channels)
        '''stage 3d'''
        # h1->320*320, hd3->80*80, Pooling 4 times
        self.h1_PT_hd3 = nn.MaxPool2D(4, 4, ceil_mode=True)
        self.h1_PT_hd3_cbr = ConvBnReLU2D(filters[0], cat_channels)
        # h2->160*160, hd3->80*80, Pooling 2 times
        self.h2_PT_hd3 = nn.MaxPool2D(2, 2, ceil_mode=True)
        self.h2_PT_hd3_cbr = ConvBnReLU2D(filters[1], cat_channels)
        # h3->80*80, hd3->80*80, Concatenation
        self.h3_Cat_hd3_cbr = ConvBnReLU2D(filters[2], cat_channels)
        # hd4->40*40, hd3->80*80, Upsample 2 times
        self.hd4_UT_hd3 = nn.Upsample(scale_factor=2, mode='bilinear')
        self.hd4_UT_hd3_cbr = ConvBnReLU2D(up_channels, cat_channels)
        # hd5->20*20, hd3->80*80, Upsample 4 times
        self.hd5_UT_hd3 = nn.Upsample(scale_factor=4, mode='bilinear')
        self.hd5_UT_hd3_cbr = ConvBnReLU2D(filters[4], cat_channels)
        # fusion(h1_PT_hd3, h2_PT_hd3, h3_Cat_hd3, hd4_UT_hd3, hd5_UT_hd3)
        self.cbr3d_1 = ConvBnReLU2D(up_channels, up_channels)
        '''stage 2d '''
        # h1->320*320, hd2->160*160, Pooling 2 times
        self.h1_PT_hd2 = nn.MaxPool2D(2, 2, ceil_mode=True)
        self.h1_PT_hd2_cbr = ConvBnReLU2D(filters[0], cat_channels)
        # h2->160*160, hd2->160*160, Concatenation
        self.h2_Cat_hd2_cbr = ConvBnReLU2D(filters[1], cat_channels)
        # hd3->80*80, hd2->160*160, Upsample 2 times
        self.hd3_UT_hd2 = nn.Upsample(scale_factor=2, mode='bilinear')
        self.hd3_UT_hd2_cbr = ConvBnReLU2D(up_channels, cat_channels)
        # hd4->40*40, hd2->160*160, Upsample 4 times
        self.hd4_UT_hd2 = nn.Upsample(scale_factor=4, mode='bilinear')
        self.hd4_UT_hd2_cbr = ConvBnReLU2D(up_channels, cat_channels)
        # hd5->20*20, hd2->160*160, Upsample 8 times
        self.hd5_UT_hd2 = nn.Upsample(scale_factor=8, mode='bilinear')
        self.hd5_UT_hd2_cbr = ConvBnReLU2D(filters[4], cat_channels)
        # fusion(h1_PT_hd2, h2_Cat_hd2, hd3_UT_hd2, hd4_UT_hd2, hd5_UT_hd2)
        self.cbr2d_1 = ConvBnReLU2D(up_channels, up_channels)
        '''stage 1d'''
        # h1->320*320, hd1->320*320, Concatenation
        self.h1_Cat_hd1_cbr = ConvBnReLU2D(filters[0], cat_channels)
        # hd2->160*160, hd1->320*320, Upsample 2 times
        self.hd2_UT_hd1 = nn.Upsample(scale_factor=2, mode='bilinear')
        self.hd2_UT_hd1_cbr = ConvBnReLU2D(up_channels, cat_channels)
        # hd3->80*80, hd1->320*320, Upsample 4 times
        self.hd3_UT_hd1 = nn.Upsample(scale_factor=4, mode='bilinear')
        self.hd3_UT_hd1_cbr = ConvBnReLU2D(up_channels, cat_channels)
        # hd4->40*40, hd1->320*320, Upsample 8 times
        self.hd4_UT_hd1 = nn.Upsample(scale_factor=8, mode='bilinear')
        self.hd4_UT_hd1_cbr = ConvBnReLU2D(up_channels, cat_channels)
        # hd5->20*20, hd1->320*320, Upsample 16 times
        self.hd5_UT_hd1 = nn.Upsample(scale_factor=16, mode='bilinear')
        self.hd5_UT_hd1_cbr = ConvBnReLU2D(filters[4], cat_channels)
        # fusion(h1_Cat_hd1, hd2_UT_hd1, hd3_UT_hd1, hd4_UT_hd1, hd5_UT_hd1)
        self.cbr1d_1 = ConvBnReLU2D(up_channels, up_channels)

    def forward(self, inputs):
        # inputs: the five encoder outputs, finest (h1) to coarsest (hd5).
        h1, h2, h3, h4, hd5 = inputs
        h1_PT_hd4 = self.h1_PT_hd4_cbr(self.h1_PT_hd4(h1))
        h2_PT_hd4 = self.h2_PT_hd4_cbr(self.h2_PT_hd4(h2))
        h3_PT_hd4 = self.h3_PT_hd4_cbr(self.h3_PT_hd4(h3))
        h4_Cat_hd4 = self.h4_Cat_hd4_cbr(h4)
        hd5_UT_hd4 = self.hd5_UT_hd4_cbr(self.hd5_UT_hd4(hd5))
        # hd4->40*40*up_channels
        hd4 = self.cbr4d_1(paddle.concat([h1_PT_hd4, h2_PT_hd4, h3_PT_hd4, h4_Cat_hd4, hd5_UT_hd4], 1))
        h1_PT_hd3 = self.h1_PT_hd3_cbr(self.h1_PT_hd3(h1))
        h2_PT_hd3 = self.h2_PT_hd3_cbr(self.h2_PT_hd3(h2))
        h3_Cat_hd3 = self.h3_Cat_hd3_cbr(h3)
        hd4_UT_hd3 = self.hd4_UT_hd3_cbr(self.hd4_UT_hd3(hd4))
        hd5_UT_hd3 = self.hd5_UT_hd3_cbr(self.hd5_UT_hd3(hd5))
        # hd3->80*80*up_channels
        hd3 = self.cbr3d_1(paddle.concat([h1_PT_hd3, h2_PT_hd3, h3_Cat_hd3, hd4_UT_hd3, hd5_UT_hd3], 1))
        h1_PT_hd2 = self.h1_PT_hd2_cbr(self.h1_PT_hd2(h1))
        h2_Cat_hd2 = self.h2_Cat_hd2_cbr(h2)
        hd3_UT_hd2 = self.hd3_UT_hd2_cbr(self.hd3_UT_hd2(hd3))
        hd4_UT_hd2 = self.hd4_UT_hd2_cbr(self.hd4_UT_hd2(hd4))
        hd5_UT_hd2 = self.hd5_UT_hd2_cbr(self.hd5_UT_hd2(hd5))
        # hd2->160*160*up_channels
        hd2 = self.cbr2d_1(paddle.concat([h1_PT_hd2, h2_Cat_hd2, hd3_UT_hd2, hd4_UT_hd2, hd5_UT_hd2], 1))
        h1_Cat_hd1 = self.h1_Cat_hd1_cbr(h1)
        hd2_UT_hd1 = self.hd2_UT_hd1_cbr(self.hd2_UT_hd1(hd2))
        hd3_UT_hd1 = self.hd3_UT_hd1_cbr(self.hd3_UT_hd1(hd3))
        hd4_UT_hd1 = self.hd4_UT_hd1_cbr(self.hd4_UT_hd1(hd4))
        hd5_UT_hd1 = self.hd5_UT_hd1_cbr(self.hd5_UT_hd1(hd5))
        # hd1->320*320*up_channels
        hd1 = self.cbr1d_1(paddle.concat([h1_Cat_hd1, hd2_UT_hd1, hd3_UT_hd1, hd4_UT_hd1, hd5_UT_hd1], 1))
        return [hd1, hd2, hd3, hd4, hd5]
216+
217+
218+
class DeepSup(nn.Layer):
    """Deep-supervision heads for UNet3+.

    Produces one `num_classes`-channel score map per decoder scale; the
    coarser maps are bilinearly upsampled back to the finest resolution.
    Returns [d1, d2, d3, d4, d5], finest scale first.
    """

    def __init__(self, up_channels, filters, num_classes):
        super(DeepSup, self).__init__()
        self.convup5 = ConvUp2D(filters[4], num_classes, 16)
        self.convup4 = ConvUp2D(up_channels, num_classes, 8)
        self.convup3 = ConvUp2D(up_channels, num_classes, 4)
        self.convup2 = ConvUp2D(up_channels, num_classes, 2)
        self.outconv1 = nn.Conv2D(up_channels, num_classes, 3, padding=1)

    def forward(self, inputs):
        hd1, hd2, hd3, hd4, hd5 = inputs
        # Pair each decoder map with its head; hd1 needs no upsampling.
        heads = (self.outconv1, self.convup2, self.convup3, self.convup4,
                 self.convup5)
        return [head(feat)
                for head, feat in zip(heads, (hd1, hd2, hd3, hd4, hd5))]
235+
236+
237+
class ConvBnReLU2D(nn.Sequential):
    """A 3x3 conv (padding 1) followed by batch norm and ReLU."""

    def __init__(self, in_channels, out_channels):
        layers = [
            nn.Conv2D(in_channels, out_channels, 3, padding=1),
            nn.BatchNorm(out_channels),
            nn.ReLU(),
        ]
        super(ConvBnReLU2D, self).__init__(*layers)
244+
245+
246+
class ConvUp2D(nn.Sequential):
    """A 3x3 conv (padding 1) followed by bilinear upsampling."""

    def __init__(self, in_channels, out_channels, scale_factor):
        layers = [
            nn.Conv2D(in_channels, out_channels, 3, padding=1),
            nn.Upsample(scale_factor=scale_factor, mode='bilinear'),
        ]
        super(ConvUp2D, self).__init__(*layers)
252+
253+
254+
class MaxPoolConv2D(nn.Sequential):
    """2x2 max pooling followed by a UnetConv2D block."""

    def __init__(self, in_channels, out_channels, is_batchnorm):
        layers = [
            nn.MaxPool2D(kernel_size=2),
            UnetConv2D(in_channels, out_channels, is_batchnorm),
        ]
        super(MaxPoolConv2D, self).__init__(*layers)
260+
261+
262+
class UnetConv2D(nn.Layer):
    """A stack of `num_conv` conv(+BN)+ReLU blocks used by the encoder.

    The first conv maps `in_channels` to `out_channels`; every later conv
    keeps `out_channels`.

    Args:
        in_channels (int): The channel number of the input feature map.
        out_channels (int): The channel number produced by each conv block.
        is_batchnorm (bool): Insert BatchNorm between conv and ReLU.
        num_conv (int, optional): Number of conv blocks. Default: 2.
        kernel_size (int, optional): Conv kernel size. Default: 3.
        stride (int, optional): Conv stride. Default: 1.
        padding (int, optional): Conv padding. Default: 1.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 is_batchnorm,
                 num_conv=2,
                 kernel_size=3,
                 stride=1,
                 padding=1):
        super(UnetConv2D, self).__init__()
        self.num_conv = num_conv
        for i in range(num_conv):
            if is_batchnorm:
                conv = nn.Sequential(
                    nn.Conv2D(in_channels, out_channels, kernel_size, stride,
                              padding), nn.BatchNorm(out_channels), nn.ReLU())
            else:
                conv = nn.Sequential(
                    nn.Conv2D(in_channels, out_channels, kernel_size, stride,
                              padding), nn.ReLU())
            setattr(self, 'conv%d' % (i + 1), conv)
            in_channels = out_channels
        # NOTE: the original code assigned `weight_attr`/`bias_attr`
        # ParamAttr objects to the child layers here. In Paddle, ParamAttr
        # only takes effect when passed to a layer's constructor, so those
        # post-construction assignments were no-ops and have been removed;
        # weight initialisation is handled by UNet3Plus.__init__ instead.

    def forward(self, inputs):
        """Apply the conv blocks sequentially to `inputs`."""
        x = inputs
        for i in range(self.num_conv):
            x = getattr(self, 'conv%d' % (i + 1))(x)
        return x

0 commit comments

Comments
 (0)