Mask RCNN Code Reading for Proposal Layer

通過下面這段程式碼拿到anchors以後

# Anchors
# Training mode generates the anchors here and wraps them in a Keras layer;
# any other mode reuses `input_anchors`, which is defined outside this excerpt.
if mode == "training":
  anchors = self.get_anchors(config.IMAGE_SHAPE)
  # Duplicate across the batch dimension because Keras requires it
  # TODO: can this be optimized to avoid duplicating the anchors?
  anchors = np.broadcast_to(anchors, (config.BATCH_SIZE,) + anchors.shape)
  # A hack to get around Keras's bad support for constants.
  # The lambda ignores its argument `x` and returns the broadcast anchors
  # wrapped in a tf.Variable; `input_image` is apparently passed only so the
  # Lambda layer has an input to be called on.
  # NOTE(review): the lambda closes over the name `anchors`, which this very
  # statement rebinds to the layer output -- keep the statement order as is.
  anchors = KL.Lambda(lambda x: tf.Variable(anchors), name="anchors")(input_image)
else:
  anchors = input_anchors

這裡總算要進行proposal layer計算了
將所有anchors以及相對應的分數(scores)和偏移量(deltas)當作輸入
而前面的東西都是class init的時候的設定

# Unpack the RPN outputs collected earlier (outside this excerpt).
rpn_class_logits, rpn_class, rpn_bbox = outputs

# Generate proposals.
# Proposals are [batch, N, (y1, x1, y2, x2)] in normalized coordinates
# and zero padded.
# The number of proposals kept after NMS depends on the mode; the values
# noted alongside the original excerpt were:
#   POST_NMS_ROIS_INFERENCE = 1000
#   POST_NMS_ROIS_TRAINING = 2000
proposal_count = (config.POST_NMS_ROIS_TRAINING if mode == "training"
                  else config.POST_NMS_ROIS_INFERENCE)

# The layer receives the RPN class probabilities, the RPN box deltas and the
# anchors, in that order.
rpn_rois = ProposalLayer(
    proposal_count=proposal_count,
    nms_threshold=config.RPN_NMS_THRESHOLD,
    name="ROI",
    config=config)([rpn_class, rpn_bbox, anchors])

if mode == "training":
    # Class ID mask to mark class IDs supported by the dataset the image
    # came from.
    active_class_ids = KL.Lambda(
        lambda x: parse_image_meta_graph(x)["active_class_ids"]
    )(input_image_meta)

    if not config.USE_RPN_ROIS:
        # Ignore predicted ROIs and use ROIs provided as an input.
        input_rois = KL.Input(shape=[config.POST_NMS_ROIS_TRAINING, 4],
                              name="input_roi", dtype=np.int32)
        # Normalize coordinates using the image height/width taken from
        # the shape of `input_image`.
        target_rois = KL.Lambda(lambda x: norm_boxes_graph(
            x, K.shape(input_image)[1:3]))(input_rois)
    else:
        target_rois = rpn_rois

其中parse_image_meta_graph會把image meta解析成下面這個dict後回傳（這裡只用到其中的"active_class_ids"）：

return {
        "image_id": image_id,
        "original_image_shape": original_image_shape,
        "image_shape": image_shape,
        "window": window,
        "scale": scale,
        "active_class_ids": active_class_ids,
    }

Proposal Layer

rpn_class：每個anchor屬於背景(BG)/前景(FG)的機率值，shape為[batch, anchors, (bg prob, fg prob)]。
rpn_bbox：每個anchor對應的4個偏移值[dy, dx, log(dh), log(dw)]，shape為[batch, anchors, 4]。
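anchors: 剛剛預先生成的有序anchor列表。注意這裡的「有序」表示anchors與rpn_class、rpn_bbox在anchor這個維度上的順序是一一對應的：第i個anchor對應第i組分數和第i組偏移量（看paper看起來是這樣，原本有點不確定這樣理解對不對）。這個理解可以從下面ProposalLayer.call()的程式碼得到印證：它用同一組top_k索引ix同時對scores、deltas和anchors做tf.gather，只有三者順序一致時這樣做才有意義。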
scores和deltas都是RPN中得到的

最終ProposalLayer會return一個經過bbox regression以及NMS過濾後anchor boxes set(稱為roi或proposal)，至此已經完成了RPN啦~
__init__內super的用法可參考文章「你不知道的 super」。

要注意的是，對於inference來說，到這裡已經拿到RPN的結果了；但對於training來說還少了一些東西，例如還要和ground truth做比較等等，所以才有後續的DetectionTargetLayer。

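另外在Python中，如果在定義class的時候寫了`__call__()`，那麼該class實例化之後，「實例名()」就是調用`__call__()`。Keras的Layer已經實作了`__call__()`，它內部會再去呼叫我們在子類別中覆寫的`call()`，所以上面的 ProposalLayer(...)([rpn_class, rpn_bbox, anchors]) 最終會執行到下面的call()。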

class ProposalLayer(KE.Layer):
    """Receives anchor scores and selects a subset to pass as proposals
    to the second stage. Filtering is done based on anchor scores and
    non-max suppression to remove overlaps. It also applies bounding
    box refinement deltas to anchors.

    Inputs:
        rpn_probs: [batch, anchors, (bg prob, fg prob)]
        rpn_bbox: [batch, anchors, (dy, dx, log(dh), log(dw))]
        anchors: [batch, anchors, (y1, x1, y2, x2)] anchors in normalized
            coordinates

    Returns:
        Proposals in normalized coordinates [batch, rois, (y1, x1, y2, x2)],
        zero padded along the rois axis up to `proposal_count`.
    """

    # Number of top-scoring anchors kept before NMS when the config object
    # does not define PRE_NMS_LIMIT (the value that used to be hard-coded).
    DEFAULT_PRE_NMS_LIMIT = 6000

    def __init__(self, proposal_count, nms_threshold, config=None, **kwargs):
        """Store the layer settings.

        proposal_count: number of proposals output per image (after NMS and
            zero padding).
        nms_threshold: IoU threshold handed to non-max suppression.
        config: configuration object; `call` reads RPN_BBOX_STD_DEV and
            IMAGES_PER_GPU from it, and PRE_NMS_LIMIT if it is defined.
        **kwargs: forwarded to the base layer constructor (e.g. `name`).
        """
        super(ProposalLayer, self).__init__(**kwargs)
        self.config = config
        self.proposal_count = proposal_count
        self.nms_threshold = nms_threshold

    def call(self, inputs):
        """Turn anchors plus RPN scores/deltas into `proposal_count` proposals.

        inputs: list of [rpn_probs, rpn_bbox, anchors] as described in the
            class docstring.

        Steps: keep the top-scoring anchors, apply the box deltas to them,
        clip the boxes to the image, then run non-max suppression and pad.
        """
        # Box scores. Use the foreground class confidence. [batch, num_anchors]
        scores = inputs[0][:, :, 1]
        # Box deltas, scaled by RPN_BBOX_STD_DEV. [batch, num_anchors, 4]
        deltas = inputs[1]
        deltas = deltas * np.reshape(self.config.RPN_BBOX_STD_DEV, [1, 1, 4])
        # Anchors
        anchors = inputs[2]

        # Improve performance by trimming to top anchors by score
        # and doing the rest on the smaller subset.
        # The limit comes from the config when available so it can be tuned;
        # otherwise the previous fixed value is used.
        limit = getattr(self.config, "PRE_NMS_LIMIT",
                        self.DEFAULT_PRE_NMS_LIMIT)
        pre_nms_limit = tf.minimum(limit, tf.shape(anchors)[1])
        ix = tf.nn.top_k(scores, pre_nms_limit, sorted=True,
                         name="top_anchors").indices
        # The same indices are used for scores, deltas and anchors, so the
        # three inputs must share the same ordering along the anchors axis.
        # NOTE(review): utils.batch_slice is defined elsewhere; presumably it
        # applies the given function to each image of the batch -- confirm.
        scores = utils.batch_slice([scores, ix], lambda x, y: tf.gather(x, y),
                                   self.config.IMAGES_PER_GPU)
        deltas = utils.batch_slice([deltas, ix], lambda x, y: tf.gather(x, y),
                                   self.config.IMAGES_PER_GPU)
        pre_nms_anchors = utils.batch_slice(
            [anchors, ix], lambda a, x: tf.gather(a, x),
            self.config.IMAGES_PER_GPU,
            names=["pre_nms_anchors"])

        # Apply deltas to anchors to get refined anchors.
        # [batch, N, (y1, x1, y2, x2)]
        boxes = utils.batch_slice([pre_nms_anchors, deltas],
                                  lambda x, y: apply_box_deltas_graph(x, y),
                                  self.config.IMAGES_PER_GPU,
                                  names=["refined_anchors"])

        # Clip to image boundaries. Since we're in normalized coordinates
        # (fractions of the image size), clip to 0..1 range so refined boxes
        # cannot extend past the image. [batch, N, (y1, x1, y2, x2)]
        window = np.array([0, 0, 1, 1], dtype=np.float32)
        boxes = utils.batch_slice(boxes,
                                  lambda x: clip_boxes_graph(x, window),
                                  self.config.IMAGES_PER_GPU,
                                  names=["refined_anchors_clipped"])

        # Filter out small boxes
        # According to Xinlei Chen's paper, this reduces detection accuracy
        # for small objects, so we're skipping it.

        # Non-max suppression
        def nms(boxes, scores):
            indices = tf.image.non_max_suppression(
                boxes, scores, self.proposal_count,
                self.nms_threshold, name="rpn_non_max_suppression")
            proposals = tf.gather(boxes, indices)
            # Pad with zero rows if NMS kept fewer than proposal_count boxes,
            # so every image yields the same number of proposals.
            padding = tf.maximum(self.proposal_count - tf.shape(proposals)[0], 0)
            proposals = tf.pad(proposals, [(0, padding), (0, 0)])
            return proposals

        proposals = utils.batch_slice([boxes, scores], nms,
                                      self.config.IMAGES_PER_GPU)
        return proposals

    def compute_output_shape(self, input_shape):
        """Return the static output shape: (batch, proposal_count, 4)."""
        return (None, self.proposal_count, 4)