// blob: 94bf216581519df765f5c131508b7e93e90ebde1 [file] [log] [blame]
/*
* Copyright 2023 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// SSD box decoding and extracting
#include "ssd_postprocess/box.h"
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// Active SSD post-processing configuration, shared by every function in this
// file. These initializers are the defaults; set_params() overwrites the whole
// struct with a caller-supplied copy.
static SsdParams params = {
// Number of output layers. Each layer contributes two tensors to model_out:
// box locations at index 2 * layer and scores at index 2 * layer + 1.
.num_layers = 4,
// Number of entries allocated for the anchor and decoded-box arrays, and the
// number of candidates scanned when thresholding. 1602 is the total anchor
// count of the default grid (1200 + 300 + 75 + 27).
.num_boxes = 1602,
// Model input size. Used to derive each layer's grid size and to normalize
// anchor positions and sizes.
.input_height = 320,
.input_width = 320,
// Divisors applied to the dequantized box values before they are combined
// with the anchor when converting to corner format.
.global_scales = {10, 10, 5, 5}, // y, x, h, w
// Per-layer quantization parameters of the box location tensors.
.box_zero_points = {115, 129, 125, 119},
.box_scales = {0.0813235, 0.0786732, 0.0687513, 0.0522251},
// Per-layer quantization parameters of the score tensors.
.score_zero_points = {211, 195, 200, 225},
.score_scales = {0.177373, 0.121247, 0.100491, 0.0550178},
// A box is reported only when its sigmoid score is strictly greater than
// this value.
.score_threshold = 0.5,
// Number of anchors generated for every grid cell of every layer.
.anchors_per_cell = 3,
// Anchor sizes, indexed by layer * anchors_per_cell + anchor. Each value is
// divided by the input height/width to get the normalized anchor size
// (presumably expressed in input pixels - confirm against the model).
.anchor_base_size = {24.0, 32.0, 40.0, 48.0, 64.0, 80.0, 96.0, 128.0, 160.0,
192.0, 256.0, 320.0},
// Per-layer stride: a layer's grid has ceil(input size / stride) cells along
// each axis.
.anchor_stride = {16, 32, 64, 128}};
// Replace the active SSD parameters with a copy of *params_in.
//
// The struct is copied by value, so the caller keeps ownership of params_in
// and may modify or release it after the call. A NULL params_in is ignored and
// the parameters currently in effect are left untouched.
void set_params(SsdParams* params_in) {
  if (!params_in) {
    return;
  }
  params = *params_in;
}
// Map a quantized value back to a real number: scale * (val - zero_point).
// The subtraction is done in integer arithmetic before scaling.
static inline float dequantize(int val, int zero_point, float scale) {
  const int offset = val - zero_point;
  return scale * offset;
}
static inline float sigmoid(float val) { return 1.0 / (1.0 + expf(-val)); }
// Generate the model anchors into `anchors`.
//
// Anchors are written layer by layer, row by row, column by column, with
// anchors_per_cell entries for every grid cell. A layer's grid has
// ceil(input size / stride) cells along each axis. With the default
// parameters this gives:
//   layer0: 20 * 20 * 3 = 1200
//   layer1: 10 * 10 * 3 = 300
//   layer2:  5 *  5 * 3 = 75
//   layer3:  3 *  3 * 3 = 27
//   total:                1602
//
// Every field is normalized by the input size: y/x are the cell offset
// (stride * cell index) and h/w are the anchor base size.
// `anchors` must have room for every generated entry; no bound is checked
// here.
static void generate_anchors(BoxCenterEncode* anchors) {
  BoxCenterEncode* slot = anchors;
  for (int layer = 0; layer < params.num_layers; ++layer) {
    const float stride = (float)params.anchor_stride[layer];
    const int rows = (params.input_height + params.anchor_stride[layer] - 1) /
                     params.anchor_stride[layer];
    const int cols = (params.input_width + params.anchor_stride[layer] - 1) /
                     params.anchor_stride[layer];
    for (int row = 0; row < rows; ++row) {
      for (int col = 0; col < cols; ++col) {
        for (int k = 0; k < params.anchors_per_cell; ++k, ++slot) {
          // Base sizes are stored contiguously, anchors_per_cell per layer.
          const int size_idx = layer * params.anchors_per_cell + k;
          slot->h = params.anchor_base_size[size_idx] / params.input_height;
          slot->w = params.anchor_base_size[size_idx] / params.input_width;
          slot->y = stride * row / params.input_height;
          slot->x = stride * col / params.input_width;
        }
      }
    }
  }
}
// Decode boxes (with score) from the quantized model inference outputs.
//
// model_out holds two tensors per layer: box locations at even indices and
// scores at odd indices. The locations tensor carries 16 values per anchor
// (4 box coordinates followed by 6 * 2 landmark coordinates), i.e. 16 x 3
// channels per grid cell. Only the first 4 values of each group are used:
//    0,  1,  2,  3
//   16, 17, 18, 19
//   32, 33, 34, 35
// They are dequantized into y, x, h, w in that order. The score tensor holds
// one value per anchor; it is dequantized and passed through a sigmoid.
//
// Decoded entries are appended to `boxes` in layer / cell / anchor order.
// `boxes` must have room for every decoded entry; no bound is checked here.
static void decode_boxes(uint8_t** model_out, BoxCenterEncode* boxes) {
  const int values_per_anchor = 16;
  BoxCenterEncode* dst = boxes;
  for (int layer = 0; layer < params.num_layers; layer++) {
    const int rows = (params.input_height + params.anchor_stride[layer] - 1) /
                     params.anchor_stride[layer];
    const int cols = (params.input_width + params.anchor_stride[layer] - 1) /
                     params.anchor_stride[layer];
    const int cells = rows * cols;

    // Boxes at even indices; scores at odd indices.
    const uint8_t* loc_tensor = model_out[2 * layer];
    const uint8_t* score_tensor = model_out[2 * layer + 1];

    // Quantization parameters are constant within a layer.
    const int box_zp = params.box_zero_points[layer];
    const float box_scale = params.box_scales[layer];
    const int score_zp = params.score_zero_points[layer];
    const float score_scale = params.score_scales[layer];

    // Running index of the anchor within this layer
    // (cell * anchors_per_cell + k).
    int anchor = 0;
    for (int cell = 0; cell < cells; cell++) {
      for (int k = 0; k < params.anchors_per_cell; k++, anchor++, dst++) {
        const uint8_t* loc = loc_tensor + values_per_anchor * anchor;
        dst->y = dequantize(loc[0], box_zp, box_scale);
        dst->x = dequantize(loc[1], box_zp, box_scale);
        dst->h = dequantize(loc[2], box_zp, box_scale);
        dst->w = dequantize(loc[3], box_zp, box_scale);
        dst->score =
            sigmoid(dequantize(score_tensor[anchor], score_zp, score_scale));
      }
    }
  }
}
// Convert one decoded box from center encoding (offsets relative to its
// anchor) to corner encoding (ymin, xmin, ymax, xmax), copying the score.
//
//   center    = box / global_scale * anchor size + anchor position
//   half size = 0.5 * exp(box size / global_scale) * anchor size
//
// global_scales is ordered y, x, h, w.
static void convert_box(const BoxCenterEncode* box_in, BoxCenterEncode* anchor,
                        BoxCornerEncode* box_out) {
  // The 0.5 literal is a double, so the half extents are computed in double
  // precision and narrowed to float when stored.
  const double h_gain = 0.5 * expf(box_in->h / params.global_scales[2]);
  const double w_gain = 0.5 * expf(box_in->w / params.global_scales[3]);
  const float half_h = h_gain * anchor->h;
  const float half_w = w_gain * anchor->w;

  const float cy = box_in->y / params.global_scales[0] * anchor->h + anchor->y;
  const float cx = box_in->x / params.global_scales[1] * anchor->w + anchor->x;

  box_out->score = box_in->score;
  box_out->ymin = cy - half_h;
  box_out->ymax = cy + half_h;
  box_out->xmin = cx - half_w;
  box_out->xmax = cx + half_w;
}
// Detect boxes by score thresholding.
//
// Every candidate in boxes_in whose score is strictly greater than
// params.score_threshold is converted to corner encoding (using the anchor at
// the same index) and appended to boxes_out->box. boxes_out->num_boxes is set
// to the number of boxes written.
//
// Output buffer:
//  - If boxes_out->box is NULL, a buffer sized for exactly the detected boxes
//    is allocated with malloc(); the caller owns it and must free() it.
//  - If boxes_out->box is not NULL it is written in place.
//    NOTE(review): its capacity cannot be checked from here, so a caller that
//    supplies or reuses a buffer must make sure it can hold every detection
//    (at most params.num_boxes entries).
//  - With no detections nothing is allocated, which avoids handing back a
//    zero-sized buffer that a later call would write past.
//  - If the allocation fails, boxes_out->box stays NULL and
//    boxes_out->num_boxes is 0.
static void detect_boxes(const BoxCenterEncode* boxes_in,
                         BoxCenterEncode* anchors, Boxes* boxes_out) {
  // First pass: count the detections so the output can be sized exactly.
  int num_detected_boxes = 0;
  for (int i = 0; i < params.num_boxes; ++i) {
    if (boxes_in[i].score > params.score_threshold) {
      num_detected_boxes++;
    }
  }

  // Report "no boxes" until the output has actually been filled in.
  boxes_out->num_boxes = 0;
  if (num_detected_boxes == 0) {
    return;
  }

  if (!(boxes_out->box)) {
    boxes_out->box = (BoxCornerEncode*)malloc(sizeof(*boxes_out->box) *
                                              (size_t)num_detected_boxes);
    if (!(boxes_out->box)) {
      return;  // Out of memory: leave the result empty instead of crashing.
    }
  }

  // Second pass: convert the detections, in their original order.
  num_detected_boxes = 0;
  for (int i = 0; i < params.num_boxes; ++i) {
    if (boxes_in[i].score > params.score_threshold) {
      convert_box(&(boxes_in[i]), &(anchors[i]),
                  &(boxes_out->box[num_detected_boxes]));
      num_detected_boxes++;
    }
  }
  boxes_out->num_boxes = num_detected_boxes;
}
// Decode the model outputs and extract the detected boxes.
//
// model_out: array of quantized output tensors, two per layer (box locations
//            at even indices, scores at odd indices).
// boxes_out: receives the detections in corner encoding; num_boxes is set to
//            the number of boxes found. See detect_boxes() for how
//            boxes_out->box is allocated or reused.
//
// The call does nothing when either argument is NULL. When params.num_boxes
// is not positive or the scratch buffers cannot be allocated,
// boxes_out->num_boxes is set to 0 and boxes_out->box is left untouched.
void get_detected_boxes(uint8_t** model_out, Boxes* boxes_out) {
  if (!model_out || !boxes_out) {
    return;
  }
  if (params.num_boxes <= 0) {
    boxes_out->num_boxes = 0;
    return;
  }

  // calloc() checks the size multiplication for overflow and zero-fills the
  // buffers, so an entry the grid never writes reads as zeros rather than as
  // indeterminate memory.
  const size_t count = (size_t)params.num_boxes;
  BoxCenterEncode* boxes_in =
      (BoxCenterEncode*)calloc(count, sizeof(*boxes_in));
  BoxCenterEncode* anchors = (BoxCenterEncode*)calloc(count, sizeof(*anchors));

  if (boxes_in && anchors) {
    generate_anchors(anchors);
    decode_boxes(model_out, boxes_in);
    detect_boxes(boxes_in, anchors, boxes_out);
  } else {
    // Out of memory: report an empty result instead of dereferencing NULL.
    boxes_out->num_boxes = 0;
  }

  // free(NULL) is a no-op, so this also covers the partial-failure case.
  free(anchors);
  free(boxes_in);
}