Continuing from the RPN output above:

rpn_class_logits, rpn_class, rpn_bbox = outputs

# Generate proposals
# Proposals are [batch, N, (y1, x1, y2, x2)] in normalized coordinates
# and zero padded.
# POST_NMS_ROIS_INFERENCE = 1000
# POST_NMS_ROIS_TRAINING = 2000
proposal_count = config.POST_NMS_ROIS_TRAINING if mode == "training"\
    else config.POST_NMS_ROIS_INFERENCE
rpn_rois = ProposalLayer(
    proposal_count=proposal_count,
    nms_threshold=config.RPN_NMS_THRESHOLD,
    name="ROI",
    config=config)([rpn_class, rpn_bbox, anchors])
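As the comments say, the proposals tensor is [batch, N, (y1, x1, y2, x2)] in normalized coordinates, zero padded up to proposal_count. A minimal illustration of that layout (the values below are made up, purely for intuition):

import numpy as np

# Hypothetical output: batch=1, proposal_count=4, two real proposals,
# coordinates normalized to [0, 1], the rest padded with zeros.
rpn_rois_example = np.array([[
    [0.10, 0.20, 0.45, 0.60],
    [0.05, 0.55, 0.30, 0.90],
    [0.00, 0.00, 0.00, 0.00],  # padding
    [0.00, 0.00, 0.00, 0.00],  # padding
]])
print(rpn_rois_example.shape)  # (1, 4, 4) -> [batch, proposal_count, (y1, x1, y2, x2)]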

Anchor box generation

So where do the anchors come from? This block in the main model code creates them:

# Anchors
if mode == "training":
    anchors = self.get_anchors(config.IMAGE_SHAPE)
    # Duplicate across the batch dimension because Keras requires it
    # TODO: can this be optimized to avoid duplicating the anchors?
    anchors = np.broadcast_to(anchors, (config.BATCH_SIZE,) + anchors.shape)
    # A hack to get around Keras's bad support for constants
    anchors = KL.Lambda(lambda x: tf.Variable(anchors), name="anchors")(input_image)
else:
    anchors = input_anchors

get_anchors

get_anchors first calls compute_backbone_shapes, whose docstring explains what it returns:

Computes the width and height of each stage of the backbone network.
Returns:
    [N, (height, width)]. Where N is the number of stages

Compared with a single-scale backbone, utils.generate_pyramid_anchors takes an extra backbone_shapes argument, because we need the size of each feature map in order to map anchor boxes back onto the original image.
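Concretely, for a ResNet/FPN backbone compute_backbone_shapes essentially just divides the image size by each backbone stride (a minimal sketch, assuming the usual BACKBONE_STRIDES = [4, 8, 16, 32, 64] for P2~P6):

import math
import numpy as np

def backbone_shapes_sketch(image_shape, strides=(4, 8, 16, 32, 64)):
    # One (height, width) per pyramid level P2~P6
    return np.array([[int(math.ceil(image_shape[0] / s)),
                      int(math.ceil(image_shape[1] / s))] for s in strides])

print(backbone_shapes_sketch([1024, 1024]))
# [[256 256]
#  [128 128]
#  [ 64  64]
#  [ 32  32]
#  [ 16  16]]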

RPN_ANCHOR_SCALES = (32, 64, 128, 256, 512)
RPN_ANCHOR_RATIOS = [0.5, 1, 2]

def get_anchors(self, image_shape):
    """Returns anchor pyramid for the given image size."""
    backbone_shapes = compute_backbone_shapes(self.config, image_shape)
    # Cache anchors and reuse if image shape is the same
    if not hasattr(self, "_anchor_cache"):
        self._anchor_cache = {}
    if not tuple(image_shape) in self._anchor_cache:
        # Generate Anchors
        a = utils.generate_pyramid_anchors(
            self.config.RPN_ANCHOR_SCALES,
            self.config.RPN_ANCHOR_RATIOS,
            backbone_shapes,
            self.config.BACKBONE_STRIDES,
            self.config.RPN_ANCHOR_STRIDE)
        # Keep a copy of the latest anchors in pixel coordinates because
        # it's used in inspect_model notebooks.
        # TODO: Remove this after the notebook are refactored to not use it
        self.anchors = a
        # Normalize coordinates
        self._anchor_cache[tuple(image_shape)] = utils.norm_boxes(a, image_shape[:2])
    return self._anchor_cache[tuple(image_shape)]
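The last line calls utils.norm_boxes to bring pixel coordinates into the [0, 1] range. A sketch of what it does, reconstructed from the repo's utils (treat the exact shift/scale as an approximation; the shift of (1, 1) on (y2, x2) follows the convention that pixel-coordinate (y2, x2) is exclusive while normalized (y2, x2) is inclusive):

import numpy as np

def norm_boxes_sketch(boxes, shape):
    # boxes: [N, (y1, x1, y2, x2)] in pixel coordinates, shape: (height, width)
    h, w = shape
    scale = np.array([h - 1, w - 1, h - 1, w - 1])
    shift = np.array([0, 0, 1, 1])
    return ((boxes - shift) / scale).astype(np.float32)

print(norm_boxes_sketch(np.array([[0., 0., 1024., 1024.]]), (1024, 1024)))
# [[0. 0. 1. 1.]]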

utils.generate_pyramid_anchors

This is clearly just a thin wrapper: each stage (P2~P6) generates its own anchors, which are appended to a list and then concatenated.
Here is a quick example of that append/concatenate step, mainly to show the I/O; it really only reshapes things.

>>> a = np.arange(5)
>>> b = np.array([11, 22, 33])
>>> a
array([0, 1, 2, 3, 4])
>>> b
array([11, 22, 33])
>>> c = []
>>> c.append(a)
>>> c
[array([0, 1, 2, 3, 4])]
>>> c.append(b)
>>> c
[array([0, 1, 2, 3, 4]), array([11, 22, 33])]
>>> np.concatenate(c, axis=0)
array([ 0,  1,  2,  3,  4, 11, 22, 33])
def generate_pyramid_anchors(scales, ratios, feature_shapes, feature_strides, anchor_stride):
    """Generate anchors at different levels of a feature pyramid. Each scale
    is associated with a level of the pyramid, but each ratio is used in
    all levels of the pyramid.
    Returns:
        anchors: [N, (y1, x1, y2, x2)]. All generated anchors in one array. Sorted
            with the same order of the given scales. So, anchors of scale[0] come
            first, then anchors of scale[1], and so on.
    """
    # Anchors
    # [anchor_count, (y1, x1, y2, x2)]
    anchors = []
    for i in range(len(scales)):
        anchors.append(
            generate_anchors(
                scales[i],          # self.config.RPN_ANCHOR_SCALES
                ratios,             # self.config.RPN_ANCHOR_RATIOS
                feature_shapes[i],  # backbone_shapes
                feature_strides[i], # self.config.BACKBONE_STRIDES [4, 8, 16, 32, 64]
                anchor_stride       # self.config.RPN_ANCHOR_STRIDE
            ))
    return np.concatenate(anchors, axis=0)
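Since each pyramid level uses exactly one scale crossed with all three ratios, the total anchor count is just the sum of H_l * W_l * 3 over the five levels. A quick sanity check for a 1024x1024 input, using the feature shapes from compute_backbone_shapes above:

# Feature map shapes for a 1024x1024 image with strides [4, 8, 16, 32, 64]
feature_shapes = [(256, 256), (128, 128), (64, 64), (32, 32), (16, 16)]
num_ratios = 3  # len(RPN_ANCHOR_RATIOS)
total = sum(h * w * num_ratios for h, w in feature_shapes)
print(total)  # 261888 anchors with anchor_stride=1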

utils.generate_anchors

The parameters are already documented in the code itself:
scales is the anchor box side length in pixels.
ratios is the list of anchor aspect ratios (width/height).
shape is the [height, width] of the feature map (not the original image).
feature_stride is the downscaling factor between the original image and that feature level (e.g. P3); with a single-scale backbone such as plain ResNet or VGG it is simply 16 everywhere.
anchor_stride is usually 1. After dividing the image by the stride you get a (w/feature_stride) x (h/feature_stride) grid, and anchor_stride controls how many grid cells apart anchors are placed; a small numeric check follows right after this list.
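Before the full trace, here is the height/width math for a single (scale, ratio) pair; note that the box area stays at scale**2 and width/height equals the given ratio:

import numpy as np

scale, ratio = 64, 0.5                 # one entry each from RPN_ANCHOR_SCALES / RPN_ANCHOR_RATIOS
height = scale / np.sqrt(ratio)        # ~90.51
width = scale * np.sqrt(ratio)         # ~45.25
print(width * height, width / height)  # ~4096.0 (= 64**2), ~0.5 (= ratio)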

def generate_anchors(scales, ratios, shape, feature_stride, anchor_stride):
    """
    scales: 1D array of anchor sizes in pixels. Example: [32, 64, 128]
    ratios: 1D array of anchor ratios of width/height. Example: [0.5, 1, 2]
    shape: [height, width] spatial shape of the feature map over which
           to generate anchors.
    feature_stride: Stride of the feature map relative to the image in pixels.
    anchor_stride: Stride of anchors on the feature map. For example, if the
        value is 2 then generate anchors for every other feature map pixel.
    """
    # Get all combinations of scales and ratios
    scales, ratios = np.meshgrid(np.array(scales), np.array(ratios))
    scales = scales.flatten()
    ratios = ratios.flatten()

    # Enumerate heights and widths from scales and ratios
    heights = scales / np.sqrt(ratios)
    widths = scales * np.sqrt(ratios)

    # Enumerate shifts in feature space
    shifts_y = np.arange(0, shape[0], anchor_stride) * feature_stride
    shifts_x = np.arange(0, shape[1], anchor_stride) * feature_stride
    shifts_x, shifts_y = np.meshgrid(shifts_x, shifts_y)

    # Enumerate combinations of shifts, widths, and heights
    box_widths, box_centers_x = np.meshgrid(widths, shifts_x)
    box_heights, box_centers_y = np.meshgrid(heights, shifts_y)

    # Reshape to get a list of (y, x) and a list of (h, w)
    box_centers = np.stack(
        [box_centers_y, box_centers_x], axis=2).reshape([-1, 2])
    box_sizes = np.stack([box_heights, box_widths], axis=2).reshape([-1, 2])

    # Convert to corner coordinates (y1, x1, y2, x2)
    boxes = np.concatenate([box_centers - 0.5 * box_sizes,
                            box_centers + 0.5 * box_sizes], axis=1)
    return boxes

Below, I walk through the function with a concrete set of parameters; by the end, widths and heights hold all the anchor box side lengths. (The shape used here is just an arbitrary demo value, larger than a real backbone_shapes entry would be, but the mechanics are the same.)
For np.meshgrid, note that the inputs need to be 1-D, and flatten() is then used to turn each resulting grid back into a 1-D array.
I find np.stack rather unintuitive; see numpy.stack最通俗的理解 for a more accessible explanation.
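Since np.stack is the least obvious part, here is a tiny standalone example of stacking two 2-D arrays along axis=2 and reshaping, which is exactly what happens to box_centers_y/box_centers_x (and box_heights/box_widths) below:

import numpy as np

ys = np.array([[ 0,  0],
               [16, 16]])            # stand-in for box_centers_y
xs = np.array([[ 0, 16],
               [ 0, 16]])            # stand-in for box_centers_x
paired = np.stack([ys, xs], axis=2)  # shape (2, 2, 2): each position holds a (y, x) pair
print(paired.reshape([-1, 2]))
# [[ 0  0]
#  [ 0 16]
#  [16  0]
#  [16 16]]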

# Get all combinations of scales and ratios
>>> scales = [32, 64, 128, 256, 512]
>>> ratios = [0.5,1,2]
>>> np.meshgrid(np.array(scales), np.array(ratios))
[
array([
[ 32, 64, 128, 256, 512],
[ 32, 64, 128, 256, 512],
[ 32, 64, 128, 256, 512]
]),
array([
[0.5, 0.5, 0.5, 0.5, 0.5],
[1. , 1. , 1. , 1. , 1. ],
[2. , 2. , 2. , 2. , 2. ]
])
]
### Passing the arguments in the opposite order gives a slightly different layout
>>> np.meshgrid(np.array(ratios), np.array(scales))
[array([[0.5, 1. , 2. ],
[0.5, 1. , 2. ],
[0.5, 1. , 2. ],
[0.5, 1. , 2. ],
[0.5, 1. , 2. ]]),
array([[ 32, 32, 32],
[ 64, 64, 64],
[128, 128, 128],
[256, 256, 256],
[512, 512, 512]])]
>>> scales, ratios = np.meshgrid(np.array(scales), np.array(ratios))
>>> scales = scales.flatten()
>>> ratios = ratios.flatten()
# Enumerate heights and widths from scales and ratios
>>> heights = scales / np.sqrt(ratios)
>>> widths = scales * np.sqrt(ratios)
>>> heights
array([ 45.254834 , 90.50966799, 181.01933598, 362.03867197,
724.07734394, 32. , 64. , 128. ,
256. , 512. , 22.627417 , 45.254834 ,
90.50966799, 181.01933598, 362.03867197])
>>> widths
array([ 22.627417 , 45.254834 , 90.50966799, 181.01933598,
362.03867197, 32. , 64. , 128. ,
256. , 512. , 45.254834 , 90.50966799,
181.01933598, 362.03867197, 724.07734394])
>>> heights.shape, widths.shape
((15,), (15,))
# Enumerate shifts in feature space
>>> anchor_stride=1
>>> shape=[1024,2048]
>>> feature_strides = [4, 8, 16, 32, 64]
>>> feature_stride = feature_strides[2]
>>> shifts_x = np.arange(0, shape[1], anchor_stride) * feature_stride
>>> shifts_y = np.arange(0, shape[0], anchor_stride) * feature_stride
>>> shifts_x
array([ 0, 16, 32, ..., 32720, 32736, 32752])
>>> shifts_y
array([ 0, 16, 32, ..., 16336, 16352, 16368])
>>> len(shifts_x), len(shifts_y)
(2048, 1024)
>>> shifts_x, shifts_y = np.meshgrid(shifts_x, shifts_y)
>>> shifts_y
array([[ 0, 0, 0, ..., 0, 0, 0],
[ 16, 16, 16, ..., 16, 16, 16],
[ 32, 32, 32, ..., 32, 32, 32],
...,
[16336, 16336, 16336, ..., 16336, 16336, 16336],
[16352, 16352, 16352, ..., 16352, 16352, 16352],
[16368, 16368, 16368, ..., 16368, 16368, 16368]])
>>> shifts_x
array([[ 0, 16, 32, ..., 32720, 32736, 32752],
[ 0, 16, 32, ..., 32720, 32736, 32752],
[ 0, 16, 32, ..., 32720, 32736, 32752],
...,
[ 0, 16, 32, ..., 32720, 32736, 32752],
[ 0, 16, 32, ..., 32720, 32736, 32752],
[ 0, 16, 32, ..., 32720, 32736, 32752]])
>>> shifts_x.shape, shifts_y.shape
((1024, 2048), (1024, 2048))
# Enumerate combinations of shifts, widths, and heights
>>> box_widths, box_centers_x = np.meshgrid(widths, shifts_x)
>>> box_heights, box_centers_y = np.meshgrid(heights, shifts_y)
>>> box_heights
array([[ 45.254834 , 90.50966799, 181.01933598, ..., 90.50966799,
181.01933598, 362.03867197],
[ 45.254834 , 90.50966799, 181.01933598, ..., 90.50966799,
181.01933598, 362.03867197],
[ 45.254834 , 90.50966799, 181.01933598, ..., 90.50966799,
181.01933598, 362.03867197],
...,
[ 45.254834 , 90.50966799, 181.01933598, ..., 90.50966799,
181.01933598, 362.03867197],
[ 45.254834 , 90.50966799, 181.01933598, ..., 90.50966799,
181.01933598, 362.03867197],
[ 45.254834 , 90.50966799, 181.01933598, ..., 90.50966799,
181.01933598, 362.03867197]])
>>> box_centers_y
array([[ 0, 0, 0, ..., 0, 0, 0],
[ 0, 0, 0, ..., 0, 0, 0],
[ 0, 0, 0, ..., 0, 0, 0],
...,
[16368, 16368, 16368, ..., 16368, 16368, 16368],
[16368, 16368, 16368, ..., 16368, 16368, 16368],
[16368, 16368, 16368, ..., 16368, 16368, 16368]])
>>> box_centers_x
array([[ 0, 0, 0, ..., 0, 0, 0],
[ 16, 16, 16, ..., 16, 16, 16],
[ 32, 32, 32, ..., 32, 32, 32],
...,
[32720, 32720, 32720, ..., 32720, 32720, 32720],
[32736, 32736, 32736, ..., 32736, 32736, 32736],
[32752, 32752, 32752, ..., 32752, 32752, 32752]])
>>> box_widths
array([[ 22.627417 , 45.254834 , 90.50966799, ..., 181.01933598,
362.03867197, 724.07734394],
[ 22.627417 , 45.254834 , 90.50966799, ..., 181.01933598,
362.03867197, 724.07734394],
[ 22.627417 , 45.254834 , 90.50966799, ..., 181.01933598,
362.03867197, 724.07734394],
...,
[ 22.627417 , 45.254834 , 90.50966799, ..., 181.01933598,
362.03867197, 724.07734394],
[ 22.627417 , 45.254834 , 90.50966799, ..., 181.01933598,
362.03867197, 724.07734394],
[ 22.627417 , 45.254834 , 90.50966799, ..., 181.01933598,
362.03867197, 724.07734394]])
>>> box_widths.shape, box_centers_x.shape, box_heights.shape, box_centers_y.shape
((2097152, 15), (2097152, 15), (2097152, 15), (2097152, 15))
# Reshape to get a list of (y, x) and a list of (h, w)
>>> box_centers = np.stack([box_centers_y, box_centers_x], axis=2).reshape([-1, 2])
>>> box_sizes = np.stack([box_heights, box_widths], axis=2).reshape([-1, 2])
>>> box_centers
array([[ 0, 0],
[ 0, 0],
[ 0, 0],
...,
[16368, 32752],
[16368, 32752],
[16368, 32752]])
>>> box_sizes
array([[ 45.254834 , 22.627417 ],
[ 90.50966799, 45.254834 ],
[181.01933598, 90.50966799],
...,
[ 90.50966799, 181.01933598],
[181.01933598, 362.03867197],
[362.03867197, 724.07734394]])
>>> len(box_centers),len(box_sizes)
(31457280, 31457280)
>>> box_centers.shape, box_sizes.shape
((31457280, 2), (31457280, 2))
# Convert to corner coordinates (y1, x1, y2, x2)
>>> boxes = np.concatenate([box_centers - 0.5 * box_sizes,
... box_centers + 0.5 * box_sizes], axis=1)
>>> boxes
array([[-2.26274170e+01, -1.13137085e+01, 2.26274170e+01,
1.13137085e+01],
[-4.52548340e+01, -2.26274170e+01, 4.52548340e+01,
2.26274170e+01],
[-9.05096680e+01, -4.52548340e+01, 9.05096680e+01,
4.52548340e+01],
...,
[ 1.63227452e+04, 3.26614903e+04, 1.64132548e+04,
3.28425097e+04],
[ 1.62774903e+04, 3.25709807e+04, 1.64585097e+04,
3.29330193e+04],
[ 1.61869807e+04, 3.23899613e+04, 1.65490193e+04,
3.31140387e+04]])
>>> boxes.shape
(31457280, 4)
Finally, this is what the np.broadcast_to call in the model code does: it simply repeats the same anchors along a new batch dimension (here with a batch size of 4):
>>> np.broadcast_to(boxes, (4,) + boxes.shape)
array([[[-2.26274170e+01, -1.13137085e+01, 2.26274170e+01,
1.13137085e+01],
[-4.52548340e+01, -2.26274170e+01, 4.52548340e+01,
2.26274170e+01],
[-9.05096680e+01, -4.52548340e+01, 9.05096680e+01,
4.52548340e+01],
...,
[ 1.63227452e+04, 3.26614903e+04, 1.64132548e+04,
3.28425097e+04],
[ 1.62774903e+04, 3.25709807e+04, 1.64585097e+04,
3.29330193e+04],
[ 1.61869807e+04, 3.23899613e+04, 1.65490193e+04,
3.31140387e+04]],
[[-2.26274170e+01, -1.13137085e+01, 2.26274170e+01,
1.13137085e+01],
[-4.52548340e+01, -2.26274170e+01, 4.52548340e+01,
2.26274170e+01],
[-9.05096680e+01, -4.52548340e+01, 9.05096680e+01,
4.52548340e+01],
...,
[ 1.63227452e+04, 3.26614903e+04, 1.64132548e+04,
3.28425097e+04],
[ 1.62774903e+04, 3.25709807e+04, 1.64585097e+04,
3.29330193e+04],
[ 1.61869807e+04, 3.23899613e+04, 1.65490193e+04,
3.31140387e+04]],
[[-2.26274170e+01, -1.13137085e+01, 2.26274170e+01,
1.13137085e+01],
[-4.52548340e+01, -2.26274170e+01, 4.52548340e+01,
2.26274170e+01],
[-9.05096680e+01, -4.52548340e+01, 9.05096680e+01,
4.52548340e+01],
...,
[ 1.63227452e+04, 3.26614903e+04, 1.64132548e+04,
3.28425097e+04],
[ 1.62774903e+04, 3.25709807e+04, 1.64585097e+04,
3.29330193e+04],
[ 1.61869807e+04, 3.23899613e+04, 1.65490193e+04,
3.31140387e+04]],
[[-2.26274170e+01, -1.13137085e+01, 2.26274170e+01,
1.13137085e+01],
[-4.52548340e+01, -2.26274170e+01, 4.52548340e+01,
2.26274170e+01],
[-9.05096680e+01, -4.52548340e+01, 9.05096680e+01,
4.52548340e+01],
...,
[ 1.63227452e+04, 3.26614903e+04, 1.64132548e+04,
3.28425097e+04],
[ 1.62774903e+04, 3.25709807e+04, 1.64585097e+04,
3.29330193e+04],
[ 1.61869807e+04, 3.23899613e+04, 1.65490193e+04,
3.31140387e+04]]])
>>> np.broadcast_to(boxes, (4,) + boxes.shape).shape
(4, 31457280, 4)