Add the ssd post-processing block

Add the ssd post-processing block, including box decoding and
extractions, and non-max suppression (NMS).

This code is refactored based on
https://spacebeaker-review.googlesource.com/c/shodan/sw/vec-iree/+/24503.

The libraries can be built successfully, and the results have been validated.

Change-Id: I30cf18982def7a6926f40fb7b5831038d58604af
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 314fda5..74835f8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -102,6 +102,7 @@
 add_subdirectory(risp4ml)
 
 add_subdirectory(samples)
+add_subdirectory(ssd_postprocess)
 
 # Add pigweed support
 include($ENV{ROOTDIR}/sw/pigweed/pw_build/pigweed.cmake)
diff --git a/README.md b/README.md
index d352caa..049ca2c 100644
--- a/README.md
+++ b/README.md
@@ -27,6 +27,7 @@
 * risp4ml: Vision preprocessing library (Reduced ISP for ML)
 * samples: Codegen and execution of ML models based on IREE
   * simple_vec_mul: Point-wise vector multiplication examples
+* ssd_postprocess: Vision postprocessing library for Single-Shot Detectors (SSD)
 
 ## Build the project
 
diff --git a/ssd_postprocess/CMakeLists.txt b/ssd_postprocess/CMakeLists.txt
new file mode 100644
index 0000000..94a6e52
--- /dev/null
+++ b/ssd_postprocess/CMakeLists.txt
@@ -0,0 +1,31 @@
+iree_cc_library(
+  NAME
+    box
+  HDRS
+    "box.h"
+    "common.h"
+  SRCS
+    "box.c"
+)
+
+iree_cc_library(
+  NAME
+    nms
+  HDRS
+    "common.h"
+    "nms.h"
+  SRCS
+    "nms.c"
+)
+
+iree_cc_library(
+  NAME
+    pipeline
+  HDRS
+    "pipeline.h"
+  SRCS
+    "pipeline.c"
+  DEPS
+    ::box
+    ::nms
+)
diff --git a/ssd_postprocess/box.c b/ssd_postprocess/box.c
new file mode 100644
index 0000000..ff0e7a8
--- /dev/null
+++ b/ssd_postprocess/box.c
@@ -0,0 +1,185 @@
+/*
+ * Copyright 2022 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// SSD box decoding and extracting
+
+#include "ssd_postprocess/box.h"
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+static SsdParams params = {
+    .num_layers = 4,
+    .num_boxes = 1602,
+    .input_height = 320,
+    .input_width = 320,
+    .global_scales = {10, 10, 5, 5},  // y, x, h, w
+    .box_zero_points = {115, 129, 125, 119},
+    .box_scales = {0.0813235, 0.0786732, 0.0687513, 0.0522251},
+    .score_zero_points = {211, 195, 200, 225},
+    .score_scales = {0.177373, 0.121247, 0.100491, 0.0550178},
+    .score_threshold = 0.5,
+    .anchors_per_cell = 3,
+    .anchor_base_size = {24.0, 32.0, 40.0, 48.0, 64.0, 80.0, 96.0, 128.0, 160.0,
+                         192.0, 256.0, 320.0},
+    .anchor_stride = {16, 32, 64, 128}};
+
+// Set SSD parameters
+void set_params(SsdParams* params_in) { params = *params_in; }
+
+static inline float dequantize(int val, int zero_point, float scale) {
+  return scale * (val - zero_point);
+}
+
+static inline float sigmoid(float val) { return 1.0 / (1.0 + expf(-val)); }
+
+// Generate model anchors
+// layer0: 20 * 20 * 3 = 1200
+// layer1: 10 * 10 * 3 = 300
+// layer2:   5 * 5 * 3 = 75
+// layer3:   3 * 3 * 3 = 27
+// total sum:            1602
+static void generate_anchors(BoxCenterEncode* anchors) {
+  int idx = 0;
+  for (int layer = 0; layer < params.num_layers; ++layer) {
+    int height_size = (params.input_height + params.anchor_stride[layer] - 1) /
+                      params.anchor_stride[layer];
+    int width_size = (params.input_width + params.anchor_stride[layer] - 1) /
+                     params.anchor_stride[layer];
+    for (int h = 0; h < height_size; h++) {
+      for (int w = 0; w < width_size; w++) {
+        for (int base = 0; base < params.anchors_per_cell; ++base) {
+          anchors[idx].y =
+              (float)params.anchor_stride[layer] * h / params.input_height;
+          anchors[idx].x =
+              (float)params.anchor_stride[layer] * w / params.input_width;
+          anchors[idx].h =
+              params.anchor_base_size[layer * params.anchors_per_cell + base] /
+              params.input_height;
+          anchors[idx].w =
+              params.anchor_base_size[layer * params.anchors_per_cell + base] /
+              params.input_width;
+          idx++;
+        }
+      }
+    }
+  }
+}
+
+// Decode boxes (with score) from model inference outputs
+// The locations channel dim is 16 x 3.
+// Each 16 is composed of (4 box coordinates + 6 * 2 landmarks coordinates).
+// We need only the first 4 box coordinates - so want to keep only indexes:
+//  0, 1, 2, 3
+// 16,17,18,19
+// 32,33,34,35
+static void decode_boxes(uint8_t** model_out, BoxCenterEncode* boxes) {
+  const int num_coordinates = 16;
+  int box_idx = 0;
+  for (int layer = 0; layer < params.num_layers; layer++) {
+    int height_size = (params.input_height + params.anchor_stride[layer] - 1) /
+                      params.anchor_stride[layer];
+    int width_size = (params.input_width + params.anchor_stride[layer] - 1) /
+                     params.anchor_stride[layer];
+    // Boxes at even indicees; scores at odd indices
+    uint8_t* boxes_out = model_out[2 * layer];
+    uint8_t* scores_out = model_out[2 * layer + 1];
+    for (int i = 0; i < height_size * width_size; i++) {
+      for (int j = 0; j < params.anchors_per_cell; j++) {
+        int score_idx = i * params.anchors_per_cell + j;
+        int chan_idx = num_coordinates * score_idx;
+        // dequantize box
+        boxes[box_idx].y =
+            dequantize(boxes_out[chan_idx], params.box_zero_points[layer],
+                       params.box_scales[layer]);
+        boxes[box_idx].x =
+            dequantize(boxes_out[chan_idx + 1], params.box_zero_points[layer],
+                       params.box_scales[layer]);
+        boxes[box_idx].h =
+            dequantize(boxes_out[chan_idx + 2], params.box_zero_points[layer],
+                       params.box_scales[layer]);
+        boxes[box_idx].w =
+            dequantize(boxes_out[chan_idx + 3], params.box_zero_points[layer],
+                       params.box_scales[layer]);
+        // dequantize score
+        float dequant_score =
+            dequantize(scores_out[score_idx], params.score_zero_points[layer],
+                       params.score_scales[layer]);
+        boxes[box_idx].score = sigmoid(dequant_score);
+        box_idx++;
+      }
+    }
+  }
+}
+
+// Convert box from center encoding to corner encoding format
+static void convert_box(const BoxCenterEncode* box_in, BoxCenterEncode* anchor,
+                        BoxCornerEncode* box_out) {
+  float y_center = box_in->y / params.global_scales[0] * anchor->h + anchor->y;
+  float x_center = box_in->x / params.global_scales[1] * anchor->w + anchor->x;
+  float half_h = 0.5 * expf(box_in->h / params.global_scales[2]) * anchor->h;
+  float half_w = 0.5 * expf(box_in->w / params.global_scales[3]) * anchor->w;
+
+  box_out->ymin = y_center - half_h;
+  box_out->xmin = x_center - half_w;
+  box_out->ymax = y_center + half_h;
+  box_out->xmax = x_center + half_w;
+  box_out->score = box_in->score;
+}
+
+// Detect boxes by score thresholding
+static void detect_boxes(const BoxCenterEncode* boxes_in,
+                         BoxCenterEncode* anchors, Boxes* boxes_out) {
+  int num_detected_boxes = 0;
+  for (int i = 0; i < params.num_boxes; ++i) {
+    if (boxes_in[i].score > params.score_threshold) {
+      num_detected_boxes++;
+    }
+  }
+  if (!(boxes_out->box)) {
+    boxes_out->box =
+        (BoxCornerEncode*)malloc(sizeof(BoxCornerEncode) * num_detected_boxes);
+  }
+
+  num_detected_boxes = 0;
+  for (int i = 0; i < params.num_boxes; ++i) {
+    if (boxes_in[i].score > params.score_threshold) {
+      convert_box(&(boxes_in[i]), &(anchors[i]),
+                  &(boxes_out->box[num_detected_boxes]));
+      num_detected_boxes++;
+    }
+  }
+  boxes_out->num_boxes = num_detected_boxes;
+}
+
+// Decode and extract detected boxes
+void get_detected_boxes(uint8_t** model_out, Boxes* boxes_out) {
+  BoxCenterEncode* boxes_in =
+      (BoxCenterEncode*)malloc(sizeof(BoxCenterEncode) * params.num_boxes);
+  BoxCenterEncode* anchors =
+      (BoxCenterEncode*)malloc(sizeof(BoxCenterEncode) * params.num_boxes);
+
+  generate_anchors(anchors);
+
+  decode_boxes(model_out, boxes_in);
+
+  detect_boxes(boxes_in, anchors, boxes_out);
+
+  free(anchors);
+  free(boxes_in);
+}
diff --git a/ssd_postprocess/box.h b/ssd_postprocess/box.h
new file mode 100644
index 0000000..68ac936
--- /dev/null
+++ b/ssd_postprocess/box.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright 2022 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SSD_POSTPROCESS_BOX_H_
+#define SSD_POSTPROCESS_BOX_H_
+
+#include <stdint.h>
+
+#include "ssd_postprocess/common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+#ifndef SSD_BOX_DIMS
+#define SSD_BOX_DIMS 4
+#endif
+#ifndef SSD_MAX_LAYERS
+#define SSD_MAX_LAYERS 8
+#endif
+#ifndef SSD_MAX_ANCHORS_PER_CELL
+#define SSD_MAX_ANCHORS_PER_CELL 6
+#endif
+
+typedef struct {
+  int num_layers;
+  int num_boxes;
+  int input_height;
+  int input_width;
+  int global_scales[SSD_BOX_DIMS];
+  int box_zero_points[SSD_BOX_DIMS];
+  float box_scales[SSD_BOX_DIMS];
+  int score_zero_points[SSD_BOX_DIMS];
+  float score_scales[SSD_BOX_DIMS];
+  float score_threshold;
+  int anchors_per_cell;
+  float anchor_base_size[SSD_MAX_LAYERS * SSD_MAX_ANCHORS_PER_CELL];
+  int anchor_stride[SSD_MAX_LAYERS];
+} SsdParams;
+
+// Set SSD parameters
+void set_params(SsdParams* params);
+
+// Decode and extract detected boxes from model outputs
+void get_detected_boxes(uint8_t** model_out, Boxes* boxes);
+
+#ifdef __cplusplusS
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // SSD_POSTPROCESS_BOX_H_
diff --git a/ssd_postprocess/common.h b/ssd_postprocess/common.h
new file mode 100644
index 0000000..a88592b
--- /dev/null
+++ b/ssd_postprocess/common.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2022 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SSD_POSTPROCESS_COMMON_H_
+#define SSD_POSTPROCESS_COMMON_H_
+
+typedef struct {
+  float y;
+  float x;
+  float h;
+  float w;
+  float score;
+} BoxCenterEncode;
+
+typedef struct {
+  float ymin;
+  float xmin;
+  float ymax;
+  float xmax;
+  float score;
+} BoxCornerEncode;
+
+typedef struct {
+  int num_boxes;
+  BoxCornerEncode* box;
+} Boxes;
+
+#endif  // SSD_POSTPROCESS_COMMON_H_
diff --git a/ssd_postprocess/nms.c b/ssd_postprocess/nms.c
new file mode 100644
index 0000000..4a28302
--- /dev/null
+++ b/ssd_postprocess/nms.c
@@ -0,0 +1,95 @@
+/*
+ * Copyright 2022 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// NMS (Non-Maximum Suppression) algorithm
+
+#include "ssd_postprocess/nms.h"
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#ifndef MIN
+#define MIN(a, b) (((a) < (b)) ? (a) : (b))
+#endif
+#ifndef MAX
+#define MAX(a, b) (((a) > (b)) ? (a) : (b))
+#endif
+
+// compute area of one box
+static float compute_box_area(const BoxCornerEncode* box) {
+  const float width = box->xmax - box->xmin;
+  const float height = box->ymax - box->ymin;
+  return MAX(0.0, width * height);
+}
+
+// compute IOU (intersection over union) of two boxes
+static float compute_two_boxes_iou(const BoxCornerEncode* box1,
+                                   const BoxCornerEncode* box2) {
+  const float area1 = compute_box_area(box1);
+  const float area2 = compute_box_area(box2);
+  if (area1 <= 0 || area2 <= 0) return 0.0;
+
+  BoxCornerEncode intersection_box = {.ymin = MAX(box1->ymin, box2->ymin),
+                                      .xmin = MAX(box1->xmin, box2->xmin),
+                                      .ymax = MIN(box1->ymax, box2->ymax),
+                                      .xmax = MIN(box1->xmax, box2->xmax)};
+  float intersection_area = compute_box_area(&intersection_box);
+  return intersection_area / (area1 + area2 - intersection_area);
+}
+
+// comparator for qsort
+static int comparator(const void* p, const void* q) {
+  float x = ((BoxCornerEncode*)p)->score;
+  float y = ((BoxCornerEncode*)q)->score;
+  return (y > x) - (y < x);
+}
+
+// Perform non-maximum suppression algorithm to remove "similar" bounding boxes
+void nms(Boxes* boxes_in, Boxes* boxes_out, const int max_boxes,
+         const float iou_threshold) {
+  int num_boxes = boxes_in->num_boxes;
+  uint8_t* is_suppressed = (uint8_t*)malloc(num_boxes * sizeof(uint8_t));
+  memset(is_suppressed, 0, num_boxes * sizeof(uint8_t));
+
+  // quick sort from greatest to smallest
+  qsort(boxes_in->box, num_boxes, sizeof(BoxCornerEncode), comparator);
+
+  for (int i = 0; i < num_boxes; i++) {
+    if (!is_suppressed[i]) {
+      for (int j = i + 1; j < num_boxes; j++) {
+        if (!is_suppressed[j]) {
+          if (compute_two_boxes_iou(&(boxes_in->box[i]), &(boxes_in->box[j])) >
+              iou_threshold) {
+            is_suppressed[j] = 1;
+          }
+        }
+      }
+    }
+  }
+
+  int ind_out = 0;
+  for (int i = 0; i < num_boxes; i++) {
+    if (ind_out >= max_boxes) break;
+    if (!is_suppressed[i]) {
+      boxes_out->box[ind_out++] = boxes_in->box[i];
+    }
+  }
+  boxes_out->num_boxes = ind_out;
+
+  free(is_suppressed);
+}
diff --git a/ssd_postprocess/nms.h b/ssd_postprocess/nms.h
new file mode 100644
index 0000000..183fc23
--- /dev/null
+++ b/ssd_postprocess/nms.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright 2022 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SSD_POSTPROCESS_NMS_H_
+#define SSD_POSTPROCESS_NMS_H_
+
+#include "ssd_postprocess/common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Perform non-maximum suppression algorithm to remove "similar" bounding boxes
+void nms(Boxes* boxes_in, Boxes* boxes_out, const int max_boxes,
+         const float iou_threshold);
+
+#ifdef __cplusplusS
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // SSD_POSTPROCESS_NMS_H_
diff --git a/ssd_postprocess/pipeline.c b/ssd_postprocess/pipeline.c
new file mode 100644
index 0000000..ebe251b
--- /dev/null
+++ b/ssd_postprocess/pipeline.c
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2022 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// SSD post-processing pipeline
+
+#include "ssd_postprocess/pipeline.h"
+
+#include <stdlib.h>
+
+#include "ssd_postprocess/box.h"
+#include "ssd_postprocess/nms.h"
+
+void ssd_postprocess_pipeline(uint8_t** model_out, Boxes* boxes,
+                              const int max_faces, const float iou_threshold) {
+  Boxes boxes_before_nms = {.num_boxes = 0, .box = NULL};
+  // decode and extract detected boxes
+  get_detected_boxes(model_out, &boxes_before_nms);
+  // non-max suppression
+  nms(&boxes_before_nms, boxes, max_faces, iou_threshold);
+
+  if (boxes_before_nms.box) {
+    free(boxes_before_nms.box);
+  }
+}
diff --git a/ssd_postprocess/pipeline.h b/ssd_postprocess/pipeline.h
new file mode 100644
index 0000000..85719a6
--- /dev/null
+++ b/ssd_postprocess/pipeline.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright 2022 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SSD_POSTPROCESS_PIPELINE_H_
+#define SSD_POSTPROCESS_PIPELINE_H_
+
+#include <stdint.h>
+
+#include "ssd_postprocess/common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// SSD post-processing pipeline
+void ssd_postprocess_pipeline(uint8_t** model_out, Boxes* boxes,
+                              const int max_faces, const float iou_threshold);
+
+#ifdef __cplusplusS
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // SSD_POSTPROCESS_PIPELINE_H_