From 304dfe7e9063992a87625aa53bbc7c4570747f5c Mon Sep 17 00:00:00 2001 From: Ngai Tung Kwok Date: Thu, 31 Mar 2022 12:47:05 +0800 Subject: [PATCH] fix: correct the problem when fcn_mask_head takes in invalid bboxes with negative coordinates --- .../roi_heads/mask_heads/fcn_mask_head.py | 180 ++++++++++-------- 1 file changed, 101 insertions(+), 79 deletions(-) diff --git a/mmdet/models/roi_heads/mask_heads/fcn_mask_head.py b/mmdet/models/roi_heads/mask_heads/fcn_mask_head.py index 491c426..ee6917c 100644 --- a/mmdet/models/roi_heads/mask_heads/fcn_mask_head.py +++ b/mmdet/models/roi_heads/mask_heads/fcn_mask_head.py @@ -15,45 +15,52 @@ BYTES_PER_FLOAT = 4 # TODO: This memory limit may be too much or too little. It would be better to # determine it based on available resources. -GPU_MEM_LIMIT = 1024**3 # 1 GB memory limit +GPU_MEM_LIMIT = 1024 ** 3 # 1 GB memory limit @HEADS.register_module() class FCNMaskHead(BaseModule): - - def __init__(self, - num_convs=4, - roi_feat_size=14, - in_channels=256, - conv_kernel_size=3, - conv_out_channels=256, - num_classes=80, - class_agnostic=False, - upsample_cfg=dict(type='deconv', scale_factor=2), - conv_cfg=None, - norm_cfg=None, - loss_mask=dict( - type='CrossEntropyLoss', use_mask=True, loss_weight=1.0), - init_cfg=None): - assert init_cfg is None, 'To prevent abnormal initialization ' \ - 'behavior, init_cfg is not allowed to be set' + def __init__( + self, + num_convs=4, + roi_feat_size=14, + in_channels=256, + conv_kernel_size=3, + conv_out_channels=256, + num_classes=80, + class_agnostic=False, + upsample_cfg=dict(type="deconv", scale_factor=2), + conv_cfg=None, + norm_cfg=None, + loss_mask=dict(type="CrossEntropyLoss", use_mask=True, loss_weight=1.0), + init_cfg=None, + ): + assert init_cfg is None, ( + "To prevent abnormal initialization " + "behavior, init_cfg is not allowed to be set" + ) super(FCNMaskHead, self).__init__(init_cfg) self.upsample_cfg = upsample_cfg.copy() - if self.upsample_cfg['type'] not in [ - None, 'deconv', 'nearest', 'bilinear', 'carafe' + if self.upsample_cfg["type"] not in [ + None, + "deconv", + "nearest", + "bilinear", + "carafe", ]: raise ValueError( f'Invalid upsample method {self.upsample_cfg["type"]}, ' 'accepted methods are "deconv", "nearest", "bilinear", ' - '"carafe"') + '"carafe"' + ) self.num_convs = num_convs # WARN: roi_feat_size is reserved and not used self.roi_feat_size = _pair(roi_feat_size) self.in_channels = in_channels self.conv_kernel_size = conv_kernel_size self.conv_out_channels = conv_out_channels - self.upsample_method = self.upsample_cfg.get('type') - self.scale_factor = self.upsample_cfg.pop('scale_factor', None) + self.upsample_method = self.upsample_cfg.get("type") + self.scale_factor = self.upsample_cfg.pop("scale_factor", None) self.num_classes = num_classes self.class_agnostic = class_agnostic self.conv_cfg = conv_cfg @@ -63,8 +70,7 @@ def __init__(self, self.convs = ModuleList() for i in range(self.num_convs): - in_channels = ( - self.in_channels if i == 0 else self.conv_out_channels) + in_channels = self.in_channels if i == 0 else self.conv_out_channels padding = (self.conv_kernel_size - 1) // 2 self.convs.append( ConvModule( @@ -74,37 +80,44 @@ def __init__(self, padding=padding, bias=True, conv_cfg=conv_cfg, - norm_cfg=norm_cfg)) + norm_cfg=norm_cfg, + ) + ) upsample_in_channels = ( - self.conv_out_channels if self.num_convs > 0 else in_channels) + self.conv_out_channels if self.num_convs > 0 else in_channels + ) upsample_cfg_ = self.upsample_cfg.copy() if self.upsample_method is 
None: self.upsample = None - elif self.upsample_method == 'deconv': + elif self.upsample_method == "deconv": upsample_cfg_.update( in_channels=upsample_in_channels, out_channels=self.conv_out_channels, kernel_size=self.scale_factor, - stride=self.scale_factor) + stride=self.scale_factor, + ) self.upsample = build_upsample_layer(upsample_cfg_) - elif self.upsample_method == 'carafe': + elif self.upsample_method == "carafe": upsample_cfg_.update( - channels=upsample_in_channels, scale_factor=self.scale_factor) + channels=upsample_in_channels, scale_factor=self.scale_factor + ) self.upsample = build_upsample_layer(upsample_cfg_) else: # suppress warnings - align_corners = (None - if self.upsample_method == 'nearest' else False) + align_corners = None if self.upsample_method == "nearest" else False upsample_cfg_.update( scale_factor=self.scale_factor, mode=self.upsample_method, - align_corners=align_corners) + align_corners=align_corners, + ) self.upsample = build_upsample_layer(upsample_cfg_) out_channels = 1 if self.class_agnostic else self.num_classes logits_in_channel = ( self.conv_out_channels - if self.upsample_method == 'deconv' else upsample_in_channels) + if self.upsample_method == "deconv" + else upsample_in_channels + ) self.conv_logits = Conv2d(logits_in_channel, out_channels, 1) self.relu = nn.ReLU(inplace=True) self.debug_imgs = None @@ -117,8 +130,7 @@ def init_weights(self): elif isinstance(m, CARAFEPack): m.init_weights() else: - nn.init.kaiming_normal_( - m.weight, mode='fan_out', nonlinearity='relu') + nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu") nn.init.constant_(m.bias, 0) @auto_fp16() @@ -127,21 +139,20 @@ def forward(self, x): x = conv(x) if self.upsample is not None: x = self.upsample(x) - if self.upsample_method == 'deconv': + if self.upsample_method == "deconv": x = self.relu(x) mask_pred = self.conv_logits(x) return mask_pred def get_targets(self, sampling_results, gt_masks, rcnn_train_cfg): pos_proposals = [res.pos_bboxes for res in sampling_results] - pos_assigned_gt_inds = [ - res.pos_assigned_gt_inds for res in sampling_results - ] - mask_targets = mask_target(pos_proposals, pos_assigned_gt_inds, - gt_masks, rcnn_train_cfg) + pos_assigned_gt_inds = [res.pos_assigned_gt_inds for res in sampling_results] + mask_targets = mask_target( + pos_proposals, pos_assigned_gt_inds, gt_masks, rcnn_train_cfg + ) return mask_targets - @force_fp32(apply_to=('mask_pred', )) + @force_fp32(apply_to=("mask_pred",)) def loss(self, mask_pred, mask_targets, labels): """ Example: @@ -166,15 +177,25 @@ def loss(self, mask_pred, mask_targets, labels): loss_mask = mask_pred.sum() else: if self.class_agnostic: - loss_mask = self.loss_mask(mask_pred, mask_targets, - torch.zeros_like(labels)) + loss_mask = self.loss_mask( + mask_pred, mask_targets, torch.zeros_like(labels) + ) else: loss_mask = self.loss_mask(mask_pred, mask_targets, labels) - loss['loss_mask'] = loss_mask + loss["loss_mask"] = loss_mask return loss - def get_seg_masks(self, mask_pred, det_bboxes, det_labels, rcnn_test_cfg, - ori_shape, scale_factor, rescale, format=True): + def get_seg_masks( + self, + mask_pred, + det_bboxes, + det_labels, + rcnn_test_cfg, + ori_shape, + scale_factor, + rescale, + format=True, + ): """Get segmentation masks from mask_pred and bboxes. Args: @@ -228,8 +249,9 @@ class label c. 
mask_pred = det_bboxes.new_tensor(mask_pred) device = mask_pred.device - cls_segms = [[] for _ in range(self.num_classes) - ] # BG is not included in num_classes + cls_segms = [ + [] for _ in range(self.num_classes) + ] # BG is not included in num_classes bboxes = det_bboxes[:, :4] labels = det_labels # No need to consider rescale and scale_factor while exporting to ONNX @@ -240,16 +262,12 @@ class label c. img_h, img_w = ori_shape[:2] else: if isinstance(scale_factor, float): - img_h = np.round(ori_shape[0] * scale_factor).astype( - np.int32) - img_w = np.round(ori_shape[1] * scale_factor).astype( - np.int32) + img_h = np.round(ori_shape[0] * scale_factor).astype(np.int32) + img_w = np.round(ori_shape[1] * scale_factor).astype(np.int32) else: w_scale, h_scale = scale_factor[0], scale_factor[1] - img_h = np.round(ori_shape[0] * h_scale.item()).astype( - np.int32) - img_w = np.round(ori_shape[1] * w_scale.item()).astype( - np.int32) + img_h = np.round(ori_shape[0] * h_scale.item()).astype(np.int32) + img_w = np.round(ori_shape[1] * w_scale.item()).astype(np.int32) scale_factor = 1.0 if not isinstance(scale_factor, (float, torch.Tensor)): @@ -262,14 +280,12 @@ class label c. if not self.class_agnostic: box_inds = torch.arange(mask_pred.shape[0]) mask_pred = mask_pred[box_inds, labels][:, None] - masks, _ = _do_paste_mask( - mask_pred, bboxes, img_h, img_w, skip_empty=False) + masks, _ = _do_paste_mask(mask_pred, bboxes, img_h, img_w, skip_empty=False) if threshold >= 0: masks = (masks >= threshold).to(dtype=torch.bool) else: # TensorRT backend does not have data type of uint8 - is_trt_backend = os.environ.get( - 'ONNX_BACKEND') == 'MMCVTensorRT' + is_trt_backend = os.environ.get("ONNX_BACKEND") == "MMCVTensorRT" target_dtype = torch.int32 if is_trt_backend else torch.uint8 masks = (masks * 255).to(dtype=target_dtype) return masks @@ -277,7 +293,7 @@ class label c. N = len(mask_pred) # The actual implementation split the input into chunks, # and paste them chunk by chunk. - if device.type == 'cpu': + if device.type == "cpu": # CPU is most efficient when they are pasted one by one with # skip_empty=True, so that it performs minimal number of # operations. @@ -286,9 +302,11 @@ class label c. # GPU benefits from parallelism for larger chunks, # but may have memory issue num_chunks = int( - np.ceil(N * img_h * img_w * BYTES_PER_FLOAT / GPU_MEM_LIMIT)) - assert (num_chunks <= - N), 'Default GPU_MEM_LIMIT is too small; try increasing it' + np.ceil(N * img_h * img_w * BYTES_PER_FLOAT / GPU_MEM_LIMIT) + ) + assert ( + num_chunks <= N + ), "Default GPU_MEM_LIMIT is too small; try increasing it" chunks = torch.chunk(torch.arange(N, device=device), num_chunks) threshold = rcnn_test_cfg.mask_thr_binary @@ -297,7 +315,8 @@ class label c. img_h, img_w, device=device, - dtype=torch.bool if threshold >= 0 else torch.uint8) + dtype=torch.bool if threshold >= 0 else torch.uint8, + ) if not self.class_agnostic: mask_pred = mask_pred[range(N), labels][:, None] @@ -308,7 +327,8 @@ class label c. bboxes[inds], img_h, img_w, - skip_empty=device.type == 'cpu') + skip_empty=device.type == "cpu", + ) if threshold >= 0: masks_chunk = (masks_chunk >= threshold).to(dtype=torch.bool) @@ -316,7 +336,7 @@ class label c. 
# for visualization and debugging masks_chunk = (masks_chunk * 255).to(dtype=torch.uint8) - im_mask[(inds, ) + spatial_inds] = masks_chunk + im_mask[(inds,) + spatial_inds] = masks_chunk for i in range(N): cls_segms[labels[i]].append(im_mask[i].detach().cpu().numpy()) @@ -353,13 +373,16 @@ def _do_paste_mask(masks, boxes, img_h, img_w, skip_empty=True): # this has more operations but is faster on COCO-scale dataset. device = masks.device if skip_empty: - x0_int, y0_int = torch.clamp( - boxes.min(dim=0).values.floor()[:2] - 1, - min=0).to(dtype=torch.int32) - x1_int = torch.clamp( - boxes[:, 2].max().ceil() + 1, max=img_w).to(dtype=torch.int32) - y1_int = torch.clamp( - boxes[:, 3].max().ceil() + 1, max=img_h).to(dtype=torch.int32) + x0_int, y0_int = torch.clamp(boxes.min(dim=0).values.floor()[:2] - 1, min=0).to( + dtype=torch.int32 + ) + x1_int = torch.clamp(boxes[:, 2].max().ceil() + 1, max=img_w, min=1).to( + dtype=torch.int32 + ) + y1_int = torch.clamp(boxes[:, 3].max().ceil() + 1, max=img_h, min=1).to( + dtype=torch.int32 + ) + else: x0_int, y0_int = 0, 0 x1_int, y1_int = img_w, img_h @@ -385,8 +408,7 @@ def _do_paste_mask(masks, boxes, img_h, img_w, skip_empty=True): gy = img_y[:, :, None].expand(N, img_y.size(1), img_x.size(1)) grid = torch.stack([gx, gy], dim=3) - img_masks = F.grid_sample( - masks.to(dtype=torch.float32), grid, align_corners=False) + img_masks = F.grid_sample(masks.to(dtype=torch.float32), grid, align_corners=False) if skip_empty: return img_masks[:, 0], (slice(y0_int, y1_int), slice(x0_int, x1_int))
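
Note: the fix itself is the `min=1` added to the `x1_int` / `y1_int` clamps in
`_do_paste_mask`. Below is a minimal standalone sketch of why that bound matters
when a detection box lies entirely at negative coordinates. The helper name
`paste_bounds`, its signature, and the example box are illustrative assumptions
for this note, not mmdet code; only the bound arithmetic mirrors the patched lines.

# Illustrative sketch (assumed helper, not part of mmdet): reproduces the
# integer crop-bound computation of _do_paste_mask when skip_empty=True.
import torch

def paste_bounds(boxes, img_h, img_w, clamp_min=False):
    # Lower bounds: shrink by 1 px and clamp to the image, as in mmdet.
    x0_int, y0_int = torch.clamp(
        boxes.min(dim=0).values.floor()[:2] - 1, min=0).to(dtype=torch.int32)
    # Upper bounds: grow by 1 px and clamp to the image; with clamp_min=True
    # they can never drop below 1 (this is what the patch adds).
    upper_min = dict(min=1) if clamp_min else {}
    x1_int = torch.clamp(
        boxes[:, 2].max().ceil() + 1, max=img_w, **upper_min).to(dtype=torch.int32)
    y1_int = torch.clamp(
        boxes[:, 3].max().ceil() + 1, max=img_h, **upper_min).to(dtype=torch.int32)
    return x0_int.item(), y0_int.item(), x1_int.item(), y1_int.item()

# Hypothetical box lying completely outside the image (negative coordinates).
boxes = torch.tensor([[-12.3, -8.7, -2.1, -3.5]])

print(paste_bounds(boxes, 224, 224, clamp_min=False))  # (0, 0, -1, -2)
print(paste_bounds(boxes, 224, 224, clamp_min=True))   # (0, 0, 1, 1)

Without the lower bound, the negative x1_int / y1_int make torch.arange(x0_int, x1_int)
empty while the returned slice(y0_int, y1_int) / slice(x0_int, x1_int) still cover most
of the image, so the later `im_mask[(inds,) + spatial_inds] = masks_chunk` assignment
can fail with a shape mismatch. Clamping the upper bounds to at least 1 keeps the crop
region a valid (possibly 1x1) window instead.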