// ssd_postprocess/box.c - sw/vec_iree - Git at Google

 /*
  * Copyright 2022 Google LLC
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 // SSD box decoding and extracting

 #include "ssd_postprocess/box.h"

 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>

 // Active SSD post-processing configuration. These are the built-in defaults;
 // set_params() replaces the whole struct.
 static SsdParams params = {
     // Model input size and output layout.
     .input_height = 320,
     .input_width = 320,
     .num_layers = 4,
     .num_boxes = 1602,  // anchors summed over all layers

     // Anchor layout. anchor_stride is indexed by layer; anchor_base_size is
     // indexed by layer * anchors_per_cell + base.
     .anchors_per_cell = 3,
     .anchor_stride = {16, 32, 64, 128},
     .anchor_base_size = {24.0, 32.0, 40.0, 48.0, 64.0, 80.0, 96.0, 128.0,
                          160.0, 192.0, 256.0, 320.0},

     // Quantization parameters, one entry per layer.
     .box_zero_points = {115, 129, 125, 119},
     .box_scales = {0.0813235, 0.0786732, 0.0687513, 0.0522251},
     .score_zero_points = {211, 195, 200, 225},
     .score_scales = {0.177373, 0.121247, 0.100491, 0.0550178},

     // Divisors applied to the decoded box terms, in y, x, h, w order.
     .global_scales = {10, 10, 5, 5},

     // A box is reported only when its score is strictly above this value.
     .score_threshold = 0.5,
 };

 // Replace the active SSD configuration with a copy of *params_in.
 void set_params(SsdParams* params_in) {
   params = *params_in;
 }

 // Map a quantized sample back to a real value: scale * (val - zero_point).
 // The subtraction is done in integer arithmetic before scaling.
 static inline float dequantize(int val, int zero_point, float scale) {
   const int centered = val - zero_point;
   return scale * centered;
 }

 static inline float sigmoid(float val) { return 1.0 / (1.0 + expf(-val)); }

 // Generate model anchors
 // With the default parameters:
 // layer0: 20 * 20 * 3 = 1200
 // layer1: 10 * 10 * 3 = 300
 // layer2:   5 * 5 * 3 = 75
 // layer3:   3 * 3 * 3 = 27
 // total sum:            1602
 //
 // Anchors are written layer by layer, then row by row, then column by column,
 // with anchors_per_cell entries per grid cell. Each layer's grid has
 // ceil(input_size / stride) cells along each axis. Positions and sizes are
 // divided by the input height/width.
 //
 // `anchors` is treated as holding params.num_boxes entries (the size
 // get_detected_boxes() allocates). Generation stops once that many entries
 // have been written, so a configuration whose grids add up to more than
 // num_boxes cannot write past the end of the buffer.
 static void generate_anchors(BoxCenterEncode* anchors) {
   int idx = 0;
   for (int layer = 0; layer < params.num_layers; ++layer) {
     // Round up so a partially covered border cell still gets anchors.
     int height_size = (params.input_height + params.anchor_stride[layer] - 1) /
                       params.anchor_stride[layer];
     int width_size = (params.input_width + params.anchor_stride[layer] - 1) /
                      params.anchor_stride[layer];
     for (int h = 0; h < height_size; h++) {
       for (int w = 0; w < width_size; w++) {
         for (int base = 0; base < params.anchors_per_cell; ++base) {
           if (idx >= params.num_boxes) {
             // Output buffer is full; writing more would overflow it.
             return;
           }
           BoxCenterEncode* anchor = &anchors[idx];
           anchor->y =
               (float)params.anchor_stride[layer] * h / params.input_height;
           anchor->x =
               (float)params.anchor_stride[layer] * w / params.input_width;
           anchor->h =
               params.anchor_base_size[layer * params.anchors_per_cell + base] /
               params.input_height;
           anchor->w =
               params.anchor_base_size[layer * params.anchors_per_cell + base] /
               params.input_width;
           idx++;
         }
       }
     }
   }
 }

 // Decode boxes (with score) from model inference outputs
 // The locations channel dim is 16 x 3.
 // Each 16 is composed of (4 box coordinates + 6 * 2 landmarks coordinates).
 // We need only the first 4 box coordinates - so want to keep only indexes:
 //  0, 1, 2, 3
 // 16,17,18,19
 // 32,33,34,35
 //
 // model_out holds two buffers per layer: box data at index 2 * layer and
 // score data at index 2 * layer + 1. Every anchor yields one entry in `boxes`
 // carrying the dequantized y, x, h, w terms and sigmoid(dequantized score).
 //
 // `boxes` is treated as holding params.num_boxes entries (the size
 // get_detected_boxes() allocates). Decoding stops once that many entries have
 // been written, so a configuration whose grids add up to more than num_boxes
 // cannot write past the end of the buffer.
 static void decode_boxes(uint8_t** model_out, BoxCenterEncode* boxes) {
   const int num_coordinates = 16;
   int box_idx = 0;
   for (int layer = 0; layer < params.num_layers; layer++) {
     int height_size = (params.input_height + params.anchor_stride[layer] - 1) /
                       params.anchor_stride[layer];
     int width_size = (params.input_width + params.anchor_stride[layer] - 1) /
                      params.anchor_stride[layer];
     // Boxes at even indices; scores at odd indices
     const uint8_t* boxes_out = model_out[2 * layer];
     const uint8_t* scores_out = model_out[2 * layer + 1];
     // Per-layer quantization parameters, held in the types dequantize()
     // takes so the calls below receive the same values as before.
     const int box_zero_point = params.box_zero_points[layer];
     const float box_scale = params.box_scales[layer];
     const int score_zero_point = params.score_zero_points[layer];
     const float score_scale = params.score_scales[layer];
     for (int i = 0; i < height_size * width_size; i++) {
       for (int j = 0; j < params.anchors_per_cell; j++) {
         if (box_idx >= params.num_boxes) {
           // Output buffer is full; writing more would overflow it.
           return;
         }
         int score_idx = i * params.anchors_per_cell + j;
         // First of the 16 location values belonging to this anchor.
         const uint8_t* coords = boxes_out + num_coordinates * score_idx;
         BoxCenterEncode* box = &boxes[box_idx];
         // dequantize box
         box->y = dequantize(coords[0], box_zero_point, box_scale);
         box->x = dequantize(coords[1], box_zero_point, box_scale);
         box->h = dequantize(coords[2], box_zero_point, box_scale);
         box->w = dequantize(coords[3], box_zero_point, box_scale);
         // dequantize score
         float dequant_score =
             dequantize(scores_out[score_idx], score_zero_point, score_scale);
         box->score = sigmoid(dequant_score);
         box_idx++;
       }
     }
   }
 }

 // Turn a decoded, center-encoded box into corner encoding using its anchor.
 // The center is the y/x term divided by its global scale, multiplied by the
 // anchor size and offset by the anchor position. Each half extent is
 // 0.5 * exp(size term / global scale) * anchor size. The score is copied
 // through unchanged.
 static void convert_box(const BoxCenterEncode* box_in, BoxCenterEncode* anchor,
                         BoxCornerEncode* box_out) {
   // Vertical axis.
   const float center_y =
       box_in->y / params.global_scales[0] * anchor->h + anchor->y;
   const float extent_y =
       0.5 * expf(box_in->h / params.global_scales[2]) * anchor->h;

   // Horizontal axis.
   const float center_x =
       box_in->x / params.global_scales[1] * anchor->w + anchor->x;
   const float extent_x =
       0.5 * expf(box_in->w / params.global_scales[3]) * anchor->w;

   box_out->score = box_in->score;
   box_out->ymin = center_y - extent_y;
   box_out->ymax = center_y + extent_y;
   box_out->xmin = center_x - extent_x;
   box_out->xmax = center_x + extent_x;
 }

 // Detect boxes by score thresholding
 //
 // Keeps every entry of boxes_in whose score is strictly greater than
 // params.score_threshold, converts it to corner encoding using the anchor at
 // the same index, and stores the results contiguously in boxes_out->box.
 // boxes_out->num_boxes receives the number of boxes stored.
 //
 // If boxes_out->box is NULL on entry, a buffer sized for exactly the detected
 // boxes is allocated with malloc and stored in boxes_out->box; this function
 // does not free it. A non-NULL boxes_out->box is used as-is.
 // NOTE(review): the capacity of a caller-provided buffer is not checked here;
 // presumably it must hold up to params.num_boxes entries - confirm with
 // callers.
 //
 // If the allocation returns NULL (failure, or a zero-size request on some
 // platforms), boxes_out->num_boxes is set to 0 and nothing is written.
 static void detect_boxes(const BoxCenterEncode* boxes_in,
                          BoxCenterEncode* anchors, Boxes* boxes_out) {
   // First pass: count the detections so the output can be sized exactly.
   int num_detected_boxes = 0;
   for (int i = 0; i < params.num_boxes; ++i) {
     if (boxes_in[i].score > params.score_threshold) {
       num_detected_boxes++;
     }
   }
   if (!(boxes_out->box)) {
     boxes_out->box =
         malloc(sizeof(*boxes_out->box) * (size_t)num_detected_boxes);
     if (!(boxes_out->box)) {
       // No storage available: report zero boxes instead of writing through
       // a NULL pointer in the loop below.
       boxes_out->num_boxes = 0;
       return;
     }
   }

   // Second pass: convert and store each detection.
   num_detected_boxes = 0;
   for (int i = 0; i < params.num_boxes; ++i) {
     if (boxes_in[i].score > params.score_threshold) {
       convert_box(&(boxes_in[i]), &(anchors[i]),
                   &(boxes_out->box[num_detected_boxes]));
       num_detected_boxes++;
     }
   }
   boxes_out->num_boxes = num_detected_boxes;
 }

 // Decode and extract detected boxes
 //
 // Generates the anchors, decodes the raw model outputs into center-encoded
 // boxes, and writes the boxes that pass the score threshold to boxes_out.
 // Both scratch buffers are freed before returning.
 //
 // model_out is indexed as 2 * layer (box data) and 2 * layer + 1 (score data)
 // by decode_boxes().
 //
 // If params.num_boxes is not positive, or either scratch buffer cannot be
 // allocated, boxes_out->num_boxes is set to 0 and boxes_out->box is left
 // untouched.
 void get_detected_boxes(uint8_t** model_out, Boxes* boxes_out) {
   BoxCenterEncode* boxes_in = NULL;
   BoxCenterEncode* anchors = NULL;

   if (params.num_boxes > 0) {
     // calloc checks the count * size multiplication for overflow and leaves
     // any entry the helpers below do not write in a defined, all-zero state.
     boxes_in = calloc((size_t)params.num_boxes, sizeof(*boxes_in));
     anchors = calloc((size_t)params.num_boxes, sizeof(*anchors));
   }
   if (!boxes_in || !anchors) {
     free(anchors);
     free(boxes_in);
     boxes_out->num_boxes = 0;
     return;
   }

   generate_anchors(anchors);

   decode_boxes(model_out, boxes_in);

   detect_boxes(boxes_in, anchors, boxes_out);

   free(anchors);
   free(boxes_in);
 }
	/*
	* Copyright 2022 Google LLC
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	// SSD box decoding and extracting

	#include "ssd_postprocess/box.h"

	#include <math.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	// Active SSD post-processing configuration. These are the built-in defaults;
	// set_params() replaces the whole struct.
	static SsdParams params = {
	    // Model input size and output layout.
	    .input_height = 320,
	    .input_width = 320,
	    .num_layers = 4,
	    .num_boxes = 1602,  // anchors summed over all layers

	    // Anchor layout. anchor_stride is indexed by layer; anchor_base_size is
	    // indexed by layer * anchors_per_cell + base.
	    .anchors_per_cell = 3,
	    .anchor_stride = {16, 32, 64, 128},
	    .anchor_base_size = {24.0, 32.0, 40.0, 48.0, 64.0, 80.0, 96.0, 128.0,
	                         160.0, 192.0, 256.0, 320.0},

	    // Quantization parameters, one entry per layer.
	    .box_zero_points = {115, 129, 125, 119},
	    .box_scales = {0.0813235, 0.0786732, 0.0687513, 0.0522251},
	    .score_zero_points = {211, 195, 200, 225},
	    .score_scales = {0.177373, 0.121247, 0.100491, 0.0550178},

	    // Divisors applied to the decoded box terms, in y, x, h, w order.
	    .global_scales = {10, 10, 5, 5},

	    // A box is reported only when its score is strictly above this value.
	    .score_threshold = 0.5,
	};

	// Replace the active SSD configuration with a copy of *params_in.
	void set_params(SsdParams* params_in) {
	  params = *params_in;
	}

	// Map a quantized sample back to a real value: scale * (val - zero_point).
	// The subtraction is done in integer arithmetic before scaling.
	static inline float dequantize(int val, int zero_point, float scale) {
	  const int centered = val - zero_point;
	  return scale * centered;
	}

	static inline float sigmoid(float val) { return 1.0 / (1.0 + expf(-val)); }

	// Generate model anchors
	// With the default parameters:
	// layer0: 20 * 20 * 3 = 1200
	// layer1: 10 * 10 * 3 = 300
	// layer2:   5 * 5 * 3 = 75
	// layer3:   3 * 3 * 3 = 27
	// total sum:            1602
	//
	// Anchors are written layer by layer, then row by row, then column by column,
	// with anchors_per_cell entries per grid cell. Each layer's grid has
	// ceil(input_size / stride) cells along each axis. Positions and sizes are
	// divided by the input height/width.
	//
	// `anchors` is treated as holding params.num_boxes entries (the size
	// get_detected_boxes() allocates). Generation stops once that many entries
	// have been written, so a configuration whose grids add up to more than
	// num_boxes cannot write past the end of the buffer.
	static void generate_anchors(BoxCenterEncode* anchors) {
	  int idx = 0;
	  for (int layer = 0; layer < params.num_layers; ++layer) {
	    // Round up so a partially covered border cell still gets anchors.
	    int height_size = (params.input_height + params.anchor_stride[layer] - 1) /
	                      params.anchor_stride[layer];
	    int width_size = (params.input_width + params.anchor_stride[layer] - 1) /
	                     params.anchor_stride[layer];
	    for (int h = 0; h < height_size; h++) {
	      for (int w = 0; w < width_size; w++) {
	        for (int base = 0; base < params.anchors_per_cell; ++base) {
	          if (idx >= params.num_boxes) {
	            // Output buffer is full; writing more would overflow it.
	            return;
	          }
	          BoxCenterEncode* anchor = &anchors[idx];
	          anchor->y =
	              (float)params.anchor_stride[layer] * h / params.input_height;
	          anchor->x =
	              (float)params.anchor_stride[layer] * w / params.input_width;
	          anchor->h =
	              params.anchor_base_size[layer * params.anchors_per_cell + base] /
	              params.input_height;
	          anchor->w =
	              params.anchor_base_size[layer * params.anchors_per_cell + base] /
	              params.input_width;
	          idx++;
	        }
	      }
	    }
	  }
	}

	// Decode boxes (with score) from model inference outputs
	// The locations channel dim is 16 x 3.
	// Each 16 is composed of (4 box coordinates + 6 * 2 landmarks coordinates).
	// We need only the first 4 box coordinates - so want to keep only indexes:
	//  0, 1, 2, 3
	// 16,17,18,19
	// 32,33,34,35
	//
	// model_out holds two buffers per layer: box data at index 2 * layer and
	// score data at index 2 * layer + 1. Every anchor yields one entry in `boxes`
	// carrying the dequantized y, x, h, w terms and sigmoid(dequantized score).
	//
	// `boxes` is treated as holding params.num_boxes entries (the size
	// get_detected_boxes() allocates). Decoding stops once that many entries have
	// been written, so a configuration whose grids add up to more than num_boxes
	// cannot write past the end of the buffer.
	static void decode_boxes(uint8_t** model_out, BoxCenterEncode* boxes) {
	  const int num_coordinates = 16;
	  int box_idx = 0;
	  for (int layer = 0; layer < params.num_layers; layer++) {
	    int height_size = (params.input_height + params.anchor_stride[layer] - 1) /
	                      params.anchor_stride[layer];
	    int width_size = (params.input_width + params.anchor_stride[layer] - 1) /
	                     params.anchor_stride[layer];
	    // Boxes at even indices; scores at odd indices
	    const uint8_t* boxes_out = model_out[2 * layer];
	    const uint8_t* scores_out = model_out[2 * layer + 1];
	    // Per-layer quantization parameters, held in the types dequantize()
	    // takes so the calls below receive the same values as before.
	    const int box_zero_point = params.box_zero_points[layer];
	    const float box_scale = params.box_scales[layer];
	    const int score_zero_point = params.score_zero_points[layer];
	    const float score_scale = params.score_scales[layer];
	    for (int i = 0; i < height_size * width_size; i++) {
	      for (int j = 0; j < params.anchors_per_cell; j++) {
	        if (box_idx >= params.num_boxes) {
	          // Output buffer is full; writing more would overflow it.
	          return;
	        }
	        int score_idx = i * params.anchors_per_cell + j;
	        // First of the 16 location values belonging to this anchor.
	        const uint8_t* coords = boxes_out + num_coordinates * score_idx;
	        BoxCenterEncode* box = &boxes[box_idx];
	        // dequantize box
	        box->y = dequantize(coords[0], box_zero_point, box_scale);
	        box->x = dequantize(coords[1], box_zero_point, box_scale);
	        box->h = dequantize(coords[2], box_zero_point, box_scale);
	        box->w = dequantize(coords[3], box_zero_point, box_scale);
	        // dequantize score
	        float dequant_score =
	            dequantize(scores_out[score_idx], score_zero_point, score_scale);
	        box->score = sigmoid(dequant_score);
	        box_idx++;
	      }
	    }
	  }
	}

	// Turn a decoded, center-encoded box into corner encoding using its anchor.
	// The center is the y/x term divided by its global scale, multiplied by the
	// anchor size and offset by the anchor position. Each half extent is
	// 0.5 * exp(size term / global scale) * anchor size. The score is copied
	// through unchanged.
	static void convert_box(const BoxCenterEncode* box_in, BoxCenterEncode* anchor,
	                        BoxCornerEncode* box_out) {
	  // Vertical axis.
	  const float center_y =
	      box_in->y / params.global_scales[0] * anchor->h + anchor->y;
	  const float extent_y =
	      0.5 * expf(box_in->h / params.global_scales[2]) * anchor->h;

	  // Horizontal axis.
	  const float center_x =
	      box_in->x / params.global_scales[1] * anchor->w + anchor->x;
	  const float extent_x =
	      0.5 * expf(box_in->w / params.global_scales[3]) * anchor->w;

	  box_out->score = box_in->score;
	  box_out->ymin = center_y - extent_y;
	  box_out->ymax = center_y + extent_y;
	  box_out->xmin = center_x - extent_x;
	  box_out->xmax = center_x + extent_x;
	}

	// Detect boxes by score thresholding
	//
	// Keeps every entry of boxes_in whose score is strictly greater than
	// params.score_threshold, converts it to corner encoding using the anchor at
	// the same index, and stores the results contiguously in boxes_out->box.
	// boxes_out->num_boxes receives the number of boxes stored.
	//
	// If boxes_out->box is NULL on entry, a buffer sized for exactly the detected
	// boxes is allocated with malloc and stored in boxes_out->box; this function
	// does not free it. A non-NULL boxes_out->box is used as-is.
	// NOTE(review): the capacity of a caller-provided buffer is not checked here;
	// presumably it must hold up to params.num_boxes entries - confirm with
	// callers.
	//
	// If the allocation returns NULL (failure, or a zero-size request on some
	// platforms), boxes_out->num_boxes is set to 0 and nothing is written.
	static void detect_boxes(const BoxCenterEncode* boxes_in,
	                         BoxCenterEncode* anchors, Boxes* boxes_out) {
	  // First pass: count the detections so the output can be sized exactly.
	  int num_detected_boxes = 0;
	  for (int i = 0; i < params.num_boxes; ++i) {
	    if (boxes_in[i].score > params.score_threshold) {
	      num_detected_boxes++;
	    }
	  }
	  if (!(boxes_out->box)) {
	    boxes_out->box =
	        malloc(sizeof(*boxes_out->box) * (size_t)num_detected_boxes);
	    if (!(boxes_out->box)) {
	      // No storage available: report zero boxes instead of writing through
	      // a NULL pointer in the loop below.
	      boxes_out->num_boxes = 0;
	      return;
	    }
	  }

	  // Second pass: convert and store each detection.
	  num_detected_boxes = 0;
	  for (int i = 0; i < params.num_boxes; ++i) {
	    if (boxes_in[i].score > params.score_threshold) {
	      convert_box(&(boxes_in[i]), &(anchors[i]),
	                  &(boxes_out->box[num_detected_boxes]));
	      num_detected_boxes++;
	    }
	  }
	  boxes_out->num_boxes = num_detected_boxes;
	}

	// Decode and extract detected boxes
	//
	// Generates the anchors, decodes the raw model outputs into center-encoded
	// boxes, and writes the boxes that pass the score threshold to boxes_out.
	// Both scratch buffers are freed before returning.
	//
	// model_out is indexed as 2 * layer (box data) and 2 * layer + 1 (score data)
	// by decode_boxes().
	//
	// If params.num_boxes is not positive, or either scratch buffer cannot be
	// allocated, boxes_out->num_boxes is set to 0 and boxes_out->box is left
	// untouched.
	void get_detected_boxes(uint8_t** model_out, Boxes* boxes_out) {
	  BoxCenterEncode* boxes_in = NULL;
	  BoxCenterEncode* anchors = NULL;

	  if (params.num_boxes > 0) {
	    // calloc checks the count * size multiplication for overflow and leaves
	    // any entry the helpers below do not write in a defined, all-zero state.
	    boxes_in = calloc((size_t)params.num_boxes, sizeof(*boxes_in));
	    anchors = calloc((size_t)params.num_boxes, sizeof(*anchors));
	  }
	  if (!boxes_in || !anchors) {
	    free(anchors);
	    free(boxes_in);
	    boxes_out->num_boxes = 0;
	    return;
	  }

	  generate_anchors(anchors);

	  decode_boxes(model_out, boxes_in);

	  detect_boxes(boxes_in, anchors, boxes_out);

	  free(anchors);
	  free(boxes_in);
	}