Rewrite RISP4ML in plain-C for Shodan

This CL is Part 1 of a 3-part change that integrates RISP4ML into the Shodan ML toolchain. This part focuses on rewriting RISP4ML in plain C.

Parts 2 and 3 will be submitted after this CL is merged (their code can be
seen in Patch Set #13).

Details of the 3-part changes:

(Part 1) Completely re-wrote RISP4ML in plain C.
- The original RISP4ML on google3 was written in C++. The memory usage with C++ RISP4ML was significantly larger. Thus, I have completely re-written RISP4ML in plain C. The memory increase with the plain-C RISP4ML is much smaller.
- Made the input/output shape to be configurable arguments instead of constants.
- Made low-level optimizations and simplifications.

(Part 2) Integrated plain-C RISP4ML into Shodan ML toolchain
- Use fssd_25_8bit_v2 as an example to load the binary (raw bayer) file and go through RISP4ML toolchain, and feed the RISP4ML output into IREE flow.
- The fssd_25_8bit_v2 example is up and running, and the correctness of output has been verified (compare output@plain-C risp4ml against output@google3).

(Part 3) Added unit tests for plain-C RISP4ML
- Added unit tests based on pw_unit_test
- Build unit tests in iree_cc_binary
- Run and check unit tests via lit_test

Change-Id: Id17dd01fd3848d9705b8b39ea2866bde3334613c
diff --git a/samples/risp4ml/CMakeLists.txt b/samples/risp4ml/CMakeLists.txt
new file mode 100644
index 0000000..0e9c88b
--- /dev/null
+++ b/samples/risp4ml/CMakeLists.txt
@@ -0,0 +1 @@
+iree_add_all_subdirs()
diff --git a/samples/risp4ml/common/CMakeLists.txt b/samples/risp4ml/common/CMakeLists.txt
new file mode 100644
index 0000000..49346c1
--- /dev/null
+++ b/samples/risp4ml/common/CMakeLists.txt
@@ -0,0 +1,26 @@
+iree_cc_library(
+  NAME
+    image
+  HDRS
+    "image.h"
+  SRCS
+    "image.c"
+)
+
+iree_cc_library(
+  NAME
+    test_utils
+  HDRS
+    "test_utils.h"
+  DEPS
+    ::image
+)
+
+iree_cc_library(
+  NAME
+    utils
+  HDRS
+    "utils.h"
+  SRCS
+    "utils.c"
+)
diff --git a/samples/risp4ml/common/constants.h b/samples/risp4ml/common/constants.h
new file mode 100644
index 0000000..5d3e658
--- /dev/null
+++ b/samples/risp4ml/common/constants.h
@@ -0,0 +1,49 @@
+#ifndef SAMPLES_RISP4ML_COMMON_CONSTANTS_H_
+#define SAMPLES_RISP4ML_COMMON_CONSTANTS_H_
+
+#include <stdint.h>
+
+// Input and output are expected to be 8 bits per pixel
+static const uint16_t kPipeInputBpp = 8;
+static const uint16_t kPipeOutputBpp = 8;
+
+// TODO(b/149969920): modify sign,bitwidths and internal precision for risp4ml
+// Assume 8.8 format
+static const uint16_t kRawPipelineBpp = 16;
+static const uint16_t kRawPipelineInteger = 8;
+static const uint16_t kRawPipelineFraction = 8;
+
+// max = 0xFF.FF = 65535
+// representing 255 + 255/256
+static const uint16_t kRawPipelineMaxVal = 0xFFFF;
+static const uint16_t kRawPipelineMinVal = 0;
+// min fraction = 0x00.01 = 1/256
+static const uint16_t kRawPipelineMinFraction = 1;
+
+// BayerIndex defines the order in 2x2 normal Bayer quad.
+// +---+---+
+// | R | Gr|
+// +---+---+
+// | Gb| B |
+// +---+---+
+typedef enum {
+  kR = 0,
+  kGr = 1,
+  kGb = 2,
+  kB = 3,
+} BayerIndex;
+
+typedef enum {
+  kRggb = 0,
+  kGrbg = 1,
+  kGbrg = 2,
+  kBggr = 3,
+} BayerPattern;
+
+#define kNumBayerPatterns 4
+
+// TODO(alexkaplan): Add a way to update this based on the image
+// or to make sure the BayerType corresponds to the loaded image
+static const BayerPattern kBayerType = kRggb;
+
+#endif  // SAMPLES_RISP4ML_COMMON_CONSTANTS_H_
diff --git a/samples/risp4ml/common/image.c b/samples/risp4ml/common/image.c
new file mode 100644
index 0000000..5c77445
--- /dev/null
+++ b/samples/risp4ml/common/image.c
@@ -0,0 +1,35 @@
+#include <stdlib.h>
+
+#include "samples/risp4ml/common/image.h"
+
+// Allocates a planar image; returns NULL on overflow or allocation failure.
+Image* image_new(uint16_t num_channels, uint16_t height, uint16_t width) {
+  // 64-bit math: uint16_t operands promote to int and could overflow there.
+  const uint64_t num_pixels = (uint64_t)width * height * num_channels;
+  if (num_pixels > SIZE_MAX / sizeof(pixel_type_t)) return NULL;
+  pixel_type_t* data = malloc((size_t)num_pixels * sizeof(pixel_type_t));
+  Image* image = data ? malloc(sizeof(Image)) : NULL;
+  if (image) *image = (Image){num_channels, height, width, data};
+  if (!image) free(data);  // free(NULL) is a no-op
+  return image;
+}
+
+// Frees the pixel buffer and the Image itself. Passing NULL is a no-op.
+void image_delete(Image* image) {
+  if (!image) return;
+  free(image->data);  // free(NULL) is a no-op, so no guard is needed
+  free(image);
+}
+
+// Address of pixel (c, y, x); size_t strides avoid int overflow. Unchecked.
+pixel_type_t* image_pixel(Image* image, uint16_t c, uint16_t y, uint16_t x) {
+  const size_t stride_y = image->width;
+  const size_t stride_c = stride_y * image->height;
+  return image->data + c * stride_c + y * stride_y + x;
+}
+
+// Address of row y in channel c; size_t strides avoid int overflow. Unchecked.
+pixel_type_t* image_row(Image* image, uint16_t c, uint16_t y) {
+  const size_t stride_y = image->width;
+  return image->data + (c * stride_y * image->height) + y * stride_y;
+}
diff --git a/samples/risp4ml/common/image.h b/samples/risp4ml/common/image.h
new file mode 100644
index 0000000..12c1fb4
--- /dev/null
+++ b/samples/risp4ml/common/image.h
@@ -0,0 +1,43 @@
+#ifndef SAMPLES_RISP4ML_COMMON_IMAGE_H_
+#define SAMPLES_RISP4ML_COMMON_IMAGE_H_
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+typedef uint16_t pixel_type_t;
+
+typedef struct {
+  uint16_t num_channels;
+  uint16_t height;
+  uint16_t width;
+  pixel_type_t* data;  // channel-planar buffer, addressed via image_pixel()
+} Image;
+
+typedef struct {
+  uint16_t num_channels;
+  uint16_t height;
+  uint16_t width;
+  uint8_t* data;
+} ImageU8;
+
+Image* image_new(uint16_t num_channels, uint16_t height, uint16_t width);
+
+void image_delete(Image* image);
+
+pixel_type_t* image_pixel(Image* image, uint16_t c, uint16_t y, uint16_t x);
+
+static inline pixel_type_t image_pixel_val(Image* image, uint16_t c,
+                                           uint16_t y, uint16_t x) {
+  return *image_pixel(image, c, y, x);  // static: no extern definition needed
+}
+
+pixel_type_t* image_row(Image* image, uint16_t c, uint16_t y);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // SAMPLES_RISP4ML_COMMON_IMAGE_H_
diff --git a/samples/risp4ml/common/test_utils.h b/samples/risp4ml/common/test_utils.h
new file mode 100644
index 0000000..616b3da
--- /dev/null
+++ b/samples/risp4ml/common/test_utils.h
@@ -0,0 +1,42 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "samples/risp4ml/common/image.h"
+
+static inline pixel_type_t Pattern(uint16_t c, uint16_t y, uint16_t x) {
+  return (pixel_type_t)(x + y * 100 + c * 10000);  // truncated to 16 bits
+}
+
+static inline void FillImage(Image* img) {  // writes Pattern() at every pixel
+  for (uint16_t c = 0; c < img->num_channels; ++c) {
+    for (uint16_t y = 0; y < img->height; ++y) {
+      for (uint16_t x = 0; x < img->width; ++x) {
+        *image_pixel(img, c, y, x) = Pattern(c, y, x);
+      }
+    }
+  }
+}
+
+// Fills every channel with rand() values in [min_val, max_val] (min <= max).
+static inline void InitImageRandom(Image* image, pixel_type_t min_val,
+                                   pixel_type_t max_val) {
+  uint32_t range = (uint32_t)max_val + 1u - min_val;  // cannot wrap to 0
+  for (uint16_t c = 0; c < image->num_channels; ++c) {
+    for (uint16_t y = 0; y < image->height; ++y) {
+      for (uint16_t x = 0; x < image->width; ++x) {
+        *image_pixel(image, c, y, x) = (pixel_type_t)(rand() % range + min_val);
+      }
+    }
+  }
+}
+
+// Sets every pixel of every channel to `val`.
+static inline void InitImage(Image* image, pixel_type_t val) {
+  for (uint16_t c = 0; c < image->num_channels; ++c) {
+    for (uint16_t y = 0; y < image->height; ++y) {
+      for (uint16_t x = 0; x < image->width; ++x) {
+        *image_pixel(image, c, y, x) = val;
+      }
+    }
+  }
+}
diff --git a/samples/risp4ml/common/utils.c b/samples/risp4ml/common/utils.c
new file mode 100644
index 0000000..83386b1
--- /dev/null
+++ b/samples/risp4ml/common/utils.c
@@ -0,0 +1,37 @@
+#include "samples/risp4ml/common/utils.h"
+
+// int GetBayerIndex(int x, int y) {
+//   // The Bayer pattern code defines which color is top left in the quad:
+//   // +---+---+
+//   // | R | Gr|
+//   // +---+---+
+//   // | Gb| B |
+//   // +---+---+
+//   return ((x & 1) + 2 * (y & 1));
+// }
+
+BayerIndex GetBayerIndex(BayerPattern bayerType, uint16_t x, uint16_t y) {
+  // Quad layout per BayerPattern value (named after its top-left pixel):
+  // 0: +---+---+ 1: +---+---+ 2: +---+---+ 3: +---+---+
+  //    | R | Gr|    | Gr| R |    | Gb| B |    | B | Gb|
+  //    +---+---+    +---+---+    +---+---+    +---+---+
+  //    | Gb| B |    | B | Gb|    | R | Gr|    | Gr| R |
+  //    +---+---+    +---+---+    +---+---+    +---+---+
+  // Pattern 0 is the base quad; the others are the base moved by one pixel in
+  // x and/or y, so shifting the coordinates lets us index the base quad.
+
+  const bool moved_in_x = (bayerType == kGrbg) || (bayerType == kBggr);
+  const bool moved_in_y = (bayerType == kGbrg) || (bayerType == kBggr);
+  const uint16_t col = (uint16_t)(x + (moved_in_x ? 1 : 0));
+  const uint16_t row = (uint16_t)(y + (moved_in_y ? 1 : 0));
+  return (BayerIndex)((col & 1) + 2 * (row & 1));
+}
+
+int BayerMirrorBoundary(int x, int size) {
+  if (x >= 0 && x < size) return x;  // inside the image: nothing to mirror
+  // Left of the image: mirrored index that keeps the parity of x.
+  if (x < 0) return -x + 2 * (-x & 0x1) - 2;
+  // Right of the image: mirrored back below `size`, again keeping parity.
+  const int overshoot_is_even = (x - size + 1) & 0x1;
+  return 2 * size - x - 2 * overshoot_is_even;
+}
diff --git a/samples/risp4ml/common/utils.h b/samples/risp4ml/common/utils.h
new file mode 100644
index 0000000..bc29ea5
--- /dev/null
+++ b/samples/risp4ml/common/utils.h
@@ -0,0 +1,86 @@
+#ifndef SAMPLES_RISP4ML_COMMON_UTILS_H_
+#define SAMPLES_RISP4ML_COMMON_UTILS_H_
+
+#include <math.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "samples/risp4ml/common/constants.h"
+
+// Return the RAW color channel index at position (x, y) for a given Bayer
+// pattern
+// The Bayer pattern code defines which color is top left in the quad:
+// 0: +---+---+ 1: +---+---+ 2: +---+---+ 3: +---+---+
+//    | R | Gr|    | Gr| R |    | Gb| B |    | B | Gb|
+//    +---+---+    +---+---+    +---+---+    +---+---+
+//    | Gb| B |    | B | Gb|    | R | Gr|    | Gr| R |
+//    +---+---+    +---+---+    +---+---+    +---+---+
+// pattern 0 is base pattern and other patterns are shifted versions of the
+// base
+BayerIndex GetBayerIndex(BayerPattern bayerType, uint16_t x, uint16_t y);
+
+// Get the corresponding index of x in bayer images for when the index is out
+// of bounds and mirrored across the boundary.
+int BayerMirrorBoundary(int x, int size);
+
+static inline uint32_t Clamp(uint32_t value, uint32_t low, uint32_t high) {
+  return value < low ? low : (value > high ? high : value);  // low <= high
+}
+
+static inline uint16_t SubUnsignedZeroClamp(uint16_t lhs, uint16_t rhs) {
+  return rhs < lhs ? (uint16_t)(lhs - rhs) : 0;  // lhs - rhs, floored at 0
+}
+
+// Counts the leading zero bits among the N most significant bits of `in`,
+// a BPP-bit value. Requires N <= BPP < number of bits in int.
+static inline int ClzMsb(int in, int BPP, int N) {
+  int lz = 0;
+  while (lz < N && (in & (1 << (BPP - lz - 1))) == 0) {
+    ++lz;
+  }
+  return lz;
+}
+
+static inline float Roundf(float x) {
+  int d = x < 0 ? x - 0.5 : x + 0.5;  // rounds half away from zero
+  return (float)d;
+}
+
+// Converts float `x` to fixed point with `integer_bit` integer bits and
+// `frac_bit` fractional bits; `is_signed` selects the two's-complement range.
+// Out-of-range inputs saturate to the smallest/largest representable value.
+static inline int FloatToFixedPoint(float x, int integer_bit, int frac_bit,
+                                    bool is_signed) {
+  float scaled = x * (1 << frac_bit);
+  float min_value = 0;
+  float max_value = (1 << (frac_bit + integer_bit)) - 1;
+
+  if (is_signed) {
+    min_value = -(1 << (frac_bit + integer_bit - 1));
+    max_value = (1 << (frac_bit + integer_bit - 1)) - 1;
+  }
+
+  // Saturate BEFORE rounding: converting an out-of-range float to int is UB.
+  if (scaled <= min_value) {
+    return (int)min_value;
+  } else if (scaled >= max_value) {
+    return (int)max_value;
+  }
+  return (int)Roundf(scaled);
+}
+
+// Fixed-point round-to-nearest: value / 2^right_shift, halves rounded up.
+static inline int Round(int value, int right_shift) {
+  int carry = right_shift == 0 ? 0 : (value >> (right_shift - 1)) & 1;
+  return (value >> right_shift) + carry;
+}
+
+// Linearly interpolates between val0 and val1 with a fixed-point weight that
+// has `weight_precision` fractional bits: weight 0 yields val0, weight
+// (1 << weight_precision), i.e. 1.0, yields val1.
+static inline int Lerp(int val0, int val1, int weight, int weight_precision) {
+  return val0 + Round((val1 - val0) * weight, weight_precision);
+}
+
+#endif  // SAMPLES_RISP4ML_COMMON_UTILS_H_
diff --git a/samples/risp4ml/isp_stages/CMakeLists.txt b/samples/risp4ml/isp_stages/CMakeLists.txt
new file mode 100644
index 0000000..54022b7
--- /dev/null
+++ b/samples/risp4ml/isp_stages/CMakeLists.txt
@@ -0,0 +1,71 @@
+iree_cc_library(
+  NAME
+    blc
+  HDRS
+    "blc.h"
+  SRCS
+    "blc.c"
+  DEPS
+    samples::risp4ml::common::image
+    samples::risp4ml::common::utils
+)
+
+iree_cc_library(
+  NAME
+    demosaic
+  HDRS
+    "demosaic.h"
+  SRCS
+    "demosaic.c"
+  DEPS
+    samples::risp4ml::common::image
+    samples::risp4ml::common::utils
+)
+
+iree_cc_library(
+  NAME
+    dg
+  HDRS
+    "dg.h"
+  SRCS
+    "dg.c"
+  DEPS
+    samples::risp4ml::common::image
+    samples::risp4ml::common::utils
+)
+
+iree_cc_library(
+  NAME
+    downscale
+  HDRS
+    "downscale.h"
+  SRCS
+    "downscale.c"
+  DEPS
+    samples::risp4ml::common::image
+    samples::risp4ml::common::utils
+)
+
+iree_cc_library(
+  NAME
+    gamma
+  HDRS
+    "gamma.h"
+  SRCS
+    "gamma.c"
+  DEPS
+    samples::risp4ml::common::image
+    samples::risp4ml::common::utils
+)
+
+iree_cc_library(
+  NAME
+    wbg
+  HDRS
+    "wbg.h"
+  SRCS
+    "wbg.c"
+  DEPS
+    samples::risp4ml::common::image
+    samples::risp4ml::common::utils
+)
diff --git a/samples/risp4ml/isp_stages/blc.c b/samples/risp4ml/isp_stages/blc.c
new file mode 100644
index 0000000..26e36b5
--- /dev/null
+++ b/samples/risp4ml/isp_stages/blc.c
@@ -0,0 +1,28 @@
+#include "samples/risp4ml/common/utils.h"
+#include "samples/risp4ml/isp_stages/blc.h"
+
+static BlcParams blc_params = {.enable = true,
+                               .offsets = {2048, 2048, 2048, 2048}};
+
+void set_blc_params(BlcParams* params) { blc_params = *params; }
+
+// Black-level correction: subtracts the per-Bayer-channel offset from every
+// pixel of plane 0, clamping at zero. When disabled, pixels are copied
+// through unchanged; `output` keeps its own buffer (no pointer aliasing, so
+// both images can still be freed independently).
+// `output` must be pre-allocated with at least input's width and height.
+void blc_process(Image* input, Image* output) {
+  const uint16_t height = input->height;
+  const uint16_t width = input->width;
+
+  for (uint16_t y = 0; y < height; ++y) {
+    const pixel_type_t* in_line = image_row(input, 0, y);
+    pixel_type_t* out_line = image_row(output, 0, y);
+    for (uint16_t x = 0; x < width; ++x) {
+      const pixel_type_t offset =
+          blc_params.offsets[GetBayerIndex(kBayerType, x, y)];
+      out_line[x] = blc_params.enable ? SubUnsignedZeroClamp(in_line[x], offset)
+                                      : in_line[x];
+    }
+  }
+}
diff --git a/samples/risp4ml/isp_stages/blc.h b/samples/risp4ml/isp_stages/blc.h
new file mode 100644
index 0000000..4ad1466
--- /dev/null
+++ b/samples/risp4ml/isp_stages/blc.h
@@ -0,0 +1,29 @@
+#ifndef SAMPLES_RISP4ML_ISP_STAGES_BLC_H_
+#define SAMPLES_RISP4ML_ISP_STAGES_BLC_H_
+
+#include <stdbool.h>
+
+#include "samples/risp4ml/common/constants.h"
+#include "samples/risp4ml/common/image.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Black level correction parameters (see blc_process()).
+typedef struct {
+  // When false, blc_process() skips the offset subtraction.
+  bool enable;
+  // Per-channel black level, indexed by BayerIndex (kR, kGr, kGb, kB).
+  pixel_type_t offsets[kNumBayerPatterns];
+} BlcParams;
+
+void set_blc_params(BlcParams* params);
+
+void blc_process(Image* input, Image* output);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // SAMPLES_RISP4ML_ISP_STAGES_BLC_H_
diff --git a/samples/risp4ml/isp_stages/demosaic.c b/samples/risp4ml/isp_stages/demosaic.c
new file mode 100644
index 0000000..55a784b
--- /dev/null
+++ b/samples/risp4ml/isp_stages/demosaic.c
@@ -0,0 +1,78 @@
+#include <assert.h>
+
+#include "samples/risp4ml/common/utils.h"
+#include "samples/risp4ml/isp_stages/demosaic.h"
+
+#define kRgbColorChannels 3
+
+static DemosaicParams demosaic_params = {.enable = true};
+
+void set_demosaic_params(DemosaicParams* params) { demosaic_params = *params; }
+
+// Bilinear demosaic: plane 0 of `input` -> R/G/B planes 0/1/2 of `output`.
+void demosaic_process(Image* input, Image* output) {
+  if (!demosaic_params.enable) {
+    return;
+  }
+  uint16_t height = input->height;
+  uint16_t width = input->width;
+
+  const pixel_type_t* line_buffers[kRgbColorChannels];
+  int x_offset[kRgbColorChannels];
+  // Uses rows y-1, y, y+1; edges reuse row 1 / height-2, so height >= 2.
+  for (uint16_t y = 0; y < height; ++y) {
+    line_buffers[0] = (y) ? image_row(input, 0, y - 1) : image_row(input, 0, 1);
+    line_buffers[1] = image_row(input, 0, y);
+    line_buffers[2] = (y < height - 1) ? image_row(input, 0, y + 1)
+                                       : image_row(input, 0, height - 2);
+
+    for (uint16_t x = 0; x < width; ++x) {
+      for (uint16_t c = 0; c < kRgbColorChannels; ++c) {
+        x_offset[c] = BayerMirrorBoundary(x - 1 + c, width);
+      }
+
+      BayerIndex bayer_index = GetBayerIndex(kBayerType, x, y);
+      switch (bayer_index) {
+        case (kR): {  // R known; G from edges, B from corners
+          *image_pixel(output, 0, y, x) = line_buffers[1][x_offset[1]];
+          *image_pixel(output, 1, y, x) =
+              (line_buffers[0][x_offset[1]] + line_buffers[2][x_offset[1]] +
+               line_buffers[1][x_offset[0]] + line_buffers[1][x_offset[2]]) /
+              4;
+          *image_pixel(output, 2, y, x) =
+              (line_buffers[0][x_offset[0]] + line_buffers[0][x_offset[2]] +
+               line_buffers[2][x_offset[0]] + line_buffers[2][x_offset[2]]) /
+              4;
+        }; break;
+        case (kGr): {  // G known; R from left/right, B from up/down
+          *image_pixel(output, 0, y, x) =
+              (line_buffers[1][x_offset[0]] + line_buffers[1][x_offset[2]]) / 2;
+          *image_pixel(output, 1, y, x) = line_buffers[1][x_offset[1]];
+          *image_pixel(output, 2, y, x) =
+              (line_buffers[0][x_offset[1]] + line_buffers[2][x_offset[1]]) / 2;
+        }; break;
+        case (kGb): {  // G known; R from up/down, B from left/right
+          *image_pixel(output, 0, y, x) =
+              (line_buffers[0][x_offset[1]] + line_buffers[2][x_offset[1]]) / 2;
+          *image_pixel(output, 1, y, x) = line_buffers[1][x_offset[1]];
+          *image_pixel(output, 2, y, x) =
+              (line_buffers[1][x_offset[0]] + line_buffers[1][x_offset[2]]) / 2;
+        }; break;
+        case (kB): {  // B known; R from corners, G from edges
+          *image_pixel(output, 0, y, x) =
+              (line_buffers[0][x_offset[0]] + line_buffers[0][x_offset[2]] +
+               line_buffers[2][x_offset[0]] + line_buffers[2][x_offset[2]]) /
+              4;
+          *image_pixel(output, 1, y, x) =
+              (line_buffers[0][x_offset[1]] + line_buffers[2][x_offset[1]] +
+               line_buffers[1][x_offset[0]] + line_buffers[1][x_offset[2]]) /
+              4;
+          *image_pixel(output, 2, y, x) = line_buffers[1][x_offset[1]];
+        }; break;
+        default: {
+          assert(0 && "Unexpected channel index");
+        }
+      }
+    }
+  }
+}
diff --git a/samples/risp4ml/isp_stages/demosaic.h b/samples/risp4ml/isp_stages/demosaic.h
new file mode 100644
index 0000000..d32ae98
--- /dev/null
+++ b/samples/risp4ml/isp_stages/demosaic.h
@@ -0,0 +1,25 @@
+#ifndef SAMPLES_RISP4ML_ISP_STAGES_DEMOSAIC_H_
+#define SAMPLES_RISP4ML_ISP_STAGES_DEMOSAIC_H_
+
+#include <stdbool.h>
+
+#include "samples/risp4ml/common/image.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Demosaic parameters (see demosaic_process()).
+typedef struct {
+  bool enable;  // false: demosaic_process() leaves `output` untouched
+} DemosaicParams;
+
+void set_demosaic_params(DemosaicParams* params);
+
+void demosaic_process(Image* input, Image* output);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // SAMPLES_RISP4ML_ISP_STAGES_DEMOSAIC_H_
diff --git a/samples/risp4ml/isp_stages/dg.c b/samples/risp4ml/isp_stages/dg.c
new file mode 100644
index 0000000..73f7f46
--- /dev/null
+++ b/samples/risp4ml/isp_stages/dg.c
@@ -0,0 +1,36 @@
+#include "samples/risp4ml/common/utils.h"
+#include "samples/risp4ml/isp_stages/dg.h"
+
+static const uint16_t kDgFractional = kRawPipelineFraction;
+static const uint16_t kDgUnityGain = 1 << kDgFractional;
+static DgParams dg_params = {
+    .enable = true,
+    .gains = {kDgUnityGain, kDgUnityGain, kDgUnityGain, kDgUnityGain}};
+
+void set_dg_params(DgParams* params) { dg_params = *params; }
+
+// Digital gain: scales each plane-0 pixel by the fixed-point gain of its Bayer
+// channel, rounding to nearest and saturating at kRawPipelineMaxVal. When
+// disabled, pixels are copied so `output` keeps its own buffer (no aliasing).
+// `output` must be pre-allocated with at least input's width and height.
+void dg_process(Image* input, Image* output) {
+  const uint16_t height = input->height;
+  const uint16_t width = input->width;
+  const uint32_t half_lsb = 1u << (kDgFractional - 1);  // round to nearest
+
+  for (uint16_t y = 0; y < height; ++y) {
+    const pixel_type_t* in_line = image_row(input, 0, y);
+    pixel_type_t* out_line = image_row(output, 0, y);
+    for (uint16_t x = 0; x < width; ++x) {
+      if (!dg_params.enable) {
+        out_line[x] = in_line[x];
+        continue;
+      }
+      // Widen first: uint16_t * uint16_t is evaluated in int and can overflow.
+      const uint32_t gain = dg_params.gains[GetBayerIndex(kBayerType, x, y)];
+      const uint32_t scaled_pixel = in_line[x] * gain + half_lsb;
+      out_line[x] = (pixel_type_t)Clamp(scaled_pixel >> kDgFractional, 0,
+                                        kRawPipelineMaxVal);
+    }
+  }
+}
diff --git a/samples/risp4ml/isp_stages/dg.h b/samples/risp4ml/isp_stages/dg.h
new file mode 100644
index 0000000..1a0bc1b
--- /dev/null
+++ b/samples/risp4ml/isp_stages/dg.h
@@ -0,0 +1,29 @@
+#ifndef SAMPLES_RISP4ML_ISP_STAGES_DG_H_
+#define SAMPLES_RISP4ML_ISP_STAGES_DG_H_
+
+#include <stdbool.h>
+
+#include "samples/risp4ml/common/constants.h"
+#include "samples/risp4ml/common/image.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Digital gain parameters (see dg_process()).
+typedef struct {
+  // When false, dg_process() applies no gain.
+  bool enable;
+  // Per-channel fixed-point gain, indexed by BayerIndex (kR, kGr, kGb, kB).
+  pixel_type_t gains[kNumBayerPatterns];
+} DgParams;
+
+void set_dg_params(DgParams* params);
+
+void dg_process(Image* input, Image* output);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // SAMPLES_RISP4ML_ISP_STAGES_DG_H_
diff --git a/samples/risp4ml/isp_stages/downscale.c b/samples/risp4ml/isp_stages/downscale.c
new file mode 100644
index 0000000..ac8e6f5
--- /dev/null
+++ b/samples/risp4ml/isp_stages/downscale.c
@@ -0,0 +1,122 @@
+#include <assert.h>
+
+#include "samples/risp4ml/common/utils.h"
+#include "samples/risp4ml/isp_stages/downscale.h"
+
+static const uint16_t kScalePrecision = 8;
+static const uint16_t kScaleFixedOne = (1 << kScalePrecision);
+
+static DownscaleParams params = {
+    .enable = true,
+    .scale_precision = kScalePrecision,
+    .interpolate_precision = 8,
+    .interpolate_shift = 2,
+    .scale_fixed_one = kScaleFixedOne,
+    .scale_fraction_mask = kScaleFixedOne - 1,
+    .weight_shift = 0,
+    .hor_scale_factor = kScaleFixedOne,
+    .ver_scale_factor = kScaleFixedOne,
+    .ver_initial_offset = 0,
+    .hor_initial_offset = 0,
+};
+
+void set_downscale_param(DownscaleParams* in_params) { params = *in_params; }
+
+// Derives the fixed-point scale factors that map `output` onto `input`.
+void set_downscale_factor(Image* input, ImageU8* output) {
+  if (output->width == 0 || output->height == 0) return;  // avoid divide by 0
+  const uint32_t in_w = (uint32_t)kScaleFixedOne * input->width;
+  const uint32_t in_h = (uint32_t)kScaleFixedOne * input->height;
+  params.hor_scale_factor = (in_w - params.hor_initial_offset) / output->width;
+  params.ver_scale_factor = (in_h - params.ver_initial_offset) / output->height;
+}
+
+// Bilinear downscale: resamples planar 16-bit `input` into `output`, writing
+// the top 8 bits of each result in interleaved (RGBRGB...) order.
+// `output->data` must already hold width * height * num_channels bytes; when
+// the stage is disabled the function returns without touching `output`.
+void downscale_process(Image* input, ImageU8* output) {
+  assert(input->num_channels == output->num_channels);
+  if (!params.enable) {
+    return;
+  }
+  for (uint16_t channel = 0; channel < output->num_channels; ++channel) {
+    // Each output pixel at (x, y) is sampled at (X, Y) in the input image
+    // coordinate according to the following formula:
+    //    Y = (y * ver_scale_factor) + ver_initial_offset
+    //    X = (x * hor_scale_factor) + hor_initial_offset
+    // `accumulated_pos_*` is the location in the input image calculated by
+    // repeated addition of *_scale_factor
+    // `integer_pos_*` is the nearest integer index at top/left side.
+    uint32_t accumulated_pos_y = params.ver_initial_offset;
+    for (uint16_t y = 0; y < output->height; ++y) {
+      uint32_t integer_pos_y = accumulated_pos_y >> params.scale_precision;
+      uint32_t y0 = Clamp(integer_pos_y, 0, input->height - 1);
+      uint32_t y1 = Clamp(integer_pos_y + 1, 0, input->height - 1);
+      // The fractional part of the accumulated position gives us the weight but
+      // in scale_precision, since we want the weights to be in
+      // interpolate_precision we shift by their bitwidth difference which is
+      // weight_shift
+      uint32_t weight_y = Round(accumulated_pos_y & params.scale_fraction_mask,
+                                params.weight_shift);
+      uint32_t accumulated_pos_x = params.hor_initial_offset;
+
+      for (uint16_t x = 0; x < output->width; ++x) {
+        uint32_t integer_pos_x = accumulated_pos_x >> params.scale_precision;
+        uint32_t x0 = Clamp(integer_pos_x, 0, input->width - 1);
+        uint32_t x1 = Clamp(integer_pos_x + 1, 0, input->width - 1);
+        uint32_t weight_x =
+            Round(accumulated_pos_x & params.scale_fraction_mask,
+                  params.weight_shift);
+
+        // Perform vertical interpolation first to get p0 and p1,
+        // then horizontal interpolation to get output value.
+        // `interpolate_shift` adds fractional bits to the interpolated
+        // values so that quantization errors do not accumulate.
+        //   (x0, y0)             (x1, y0)
+        //      |                    |
+        //      v                    v
+        //      p0  ->  output  <-   p1
+        //      ^                    ^
+        //      |                    |
+        //   (x0, y1)             (x1, y1)
+
+        uint32_t p0 = Lerp(
+            image_pixel_val(input, channel, y0, x0) << params.interpolate_shift,
+            image_pixel_val(input, channel, y1, x0) << params.interpolate_shift,
+            weight_y, params.interpolate_precision);
+        uint32_t p1 = Lerp(
+            image_pixel_val(input, channel, y0, x1) << params.interpolate_shift,
+            image_pixel_val(input, channel, y1, x1) << params.interpolate_shift,
+            weight_y, params.interpolate_precision);
+        // To normalize the output we shift right back by interpolate shift
+        uint32_t tmp_interpolated =
+            Round(Lerp(p0, p1, weight_x, params.interpolate_precision),
+                  params.interpolate_shift);
+
+        // This is the final stage, and the output image is wanted in an
+        // interleaved RGBRGBRGB format.
+        // image_pixel() assumes a planar layout of the channels (its channel
+        // stride is width * height), so it cannot be used for the output.
+        // Hence the output data is indexed directly here, using the
+        // interleaved strides below instead of the planar ones.
+        // Within one pixel the channels are adjacent (stride 1); pixels are
+        // num_channels apart and rows are num_channels * width apart.
+        const uint16_t interleaved_stride_c = 1;
+        const uint16_t interleaved_stride_x = output->num_channels;
+        const uint32_t interleaved_stride_y =
+            output->num_channels * output->width;
+
+        // Shift interpolated result to output bitwidth. Not rounding to avoid
+        // overflow to 256.
+        output->data[y * interleaved_stride_y + x * interleaved_stride_x +
+                     channel * interleaved_stride_c] =
+            (uint8_t)(tmp_interpolated >>
+                      (kRawPipelineBpp - kPipeOutputBpp));  // 16 - 8
+
+        accumulated_pos_x += params.hor_scale_factor;
+      }
+      accumulated_pos_y += params.ver_scale_factor;
+    }
+  }
+}
diff --git a/samples/risp4ml/isp_stages/downscale.h b/samples/risp4ml/isp_stages/downscale.h
new file mode 100644
index 0000000..c28f697
--- /dev/null
+++ b/samples/risp4ml/isp_stages/downscale.h
@@ -0,0 +1,53 @@
+#ifndef SAMPLES_RISP4ML_ISP_STAGES_DOWNSCALE_H_
+#define SAMPLES_RISP4ML_ISP_STAGES_DOWNSCALE_H_
+
+#include <stdbool.h>
+
+#include "samples/risp4ml/common/image.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Bilinear downscaler parameters (see downscale_process()).
+typedef struct {
+  bool enable;
+  // scale_precision is the number of fractional bits used for scale factors and
+  // initial offsets
+  uint32_t scale_precision;
+  // interpolate_precision is the number of fractional bits used for
+  // interpolation weights
+  uint32_t interpolate_precision;
+  // interpolate_shift is the shift for pixel value before interpolation to
+  // avoid rounding error.
+  uint32_t interpolate_shift;
+  uint32_t scale_fixed_one;
+  uint32_t scale_fraction_mask;
+  uint32_t weight_shift;
+
+  // hor_scale_factor and ver_scale_factor are the downscaling ratios between
+  // input size and output size
+  // Example: Running the scaler on 4x4 image with hor_scale=2 and ver_scale=2
+  // will result in 2x2 image
+  // TODO(b/179302796): convert scaling factors to be floats.
+  // Use FloatToFixedPoint() to facilitate this.
+  uint32_t hor_scale_factor;
+  uint32_t ver_scale_factor;
+
+  // hor_initial_offset and ver_initial_offset are the offset of the first
+  // output pixel from the first input pixel in each direction respectively
+  uint32_t ver_initial_offset;
+  uint32_t hor_initial_offset;
+} DownscaleParams;
+
+void set_downscale_param(DownscaleParams* params);
+
+void set_downscale_factor(Image* input, ImageU8* output);
+
+void downscale_process(Image* input, ImageU8* output);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // SAMPLES_RISP4ML_ISP_STAGES_DOWNSCALE_H_
diff --git a/samples/risp4ml/isp_stages/gamma.c b/samples/risp4ml/isp_stages/gamma.c
new file mode 100644
index 0000000..f0e9f51
--- /dev/null
+++ b/samples/risp4ml/isp_stages/gamma.c
@@ -0,0 +1,87 @@
+#include "samples/risp4ml/common/utils.h"
+#include "samples/risp4ml/isp_stages/gamma.h"
+
+#define kRgbColorChannels 3
+
+static const uint16_t kRgbPipelineBpp = 16;
+static const uint32_t kRgbPipelineMaxVal = (1 << kRgbPipelineBpp) - 1;
+
+// Fixed HW Parameters
+static const uint8_t kGammaNumberSegments = 4;
+static const uint8_t kGammaLogSegmentOffsets[] = {0, 3, 2, 1};
+static const uint8_t kGammaLogSegmentSpacing[] = {8, 9, 10, 11};
+static const uint16_t kGammaLogNumberPoints[] = {5, 4, 4, 4};
+static const uint16_t kGammaSegmentLutOffset[] = {0, 32, 48, 64};
+
+static GammaParams gamma_params = {
+    .enable = true,
+    .lut = {0,     3255,  5552,  7237,  8618,  9809,  10868, 11828, 12710,
+            13531, 14300, 15026, 15713, 16368, 16995, 17596, 18173, 18731,
+            19269, 19790, 20295, 20786, 21264, 21728, 22182, 22624, 23056,
+            23479, 23892, 24297, 24694, 25083, 25466, 26209, 26928, 27623,
+            28298, 28953, 29590, 30211, 30816, 31406, 31983, 32547, 33099,
+            33640, 34170, 34689, 35199, 36192, 37151, 38080, 38980, 39855,
+            40705, 41534, 42341, 43129, 43899, 44652, 45389, 46111, 46818,
+            47512, 48192, 49517, 50798, 52037, 53239, 54407, 55542, 56648,
+            57726, 58778, 59806, 60811, 61794, 62757, 63702, 64627, 65535}};
+
+void set_gamma_params(GammaParams* params) { gamma_params = *params; }
+
+void gamma_process(Image* input, Image* output) {  // LUT-based gamma curve
+  uint16_t height = input->height;
+  uint16_t width = input->width;
+
+  const pixel_type_t* in_line[kRgbColorChannels];
+  pixel_type_t* out_line[kRgbColorChannels];
+
+  for (uint16_t y = 0; y < height; ++y) {
+    for (uint16_t c = 0; c < kRgbColorChannels; ++c) {
+      in_line[c] = image_row(input, c, y);
+      out_line[c] = image_row(output, c, y);
+    }
+
+    for (uint16_t x = 0; x < width; ++x) {
+      for (uint16_t c = 0; c < kRgbColorChannels; ++c) {
+        if (!gamma_params.enable) {
+          out_line[c][x] = in_line[c][x];
+        } else {
+          pixel_type_t pixel_val =
+              (pixel_type_t)Clamp(in_line[c][x], 0, kRgbPipelineMaxVal);
+
+          // Segment = 3 - (leading zeros in the top 3 bits); 0 is the darkest
+          int segment_index =
+              (kGammaNumberSegments - 1) -
+              ClzMsb(pixel_val, kRgbPipelineBpp, kGammaNumberSegments - 1);
+          uint16_t segment_left =
+              segment_index ? 1 << (kRgbPipelineBpp -
+                                    kGammaLogSegmentOffsets[segment_index])
+                            : 0;
+
+          // Index of the LUT entry at the left edge of the pixel's bin
+          int bin_index = ((pixel_val - segment_left) >>
+                           kGammaLogSegmentSpacing[segment_index]) +
+                          kGammaSegmentLutOffset[segment_index];
+
+          int offset_within_bin =
+              (pixel_val - segment_left) &
+              ((1 << kGammaLogSegmentSpacing[segment_index]) - 1);
+
+          uint16_t l_val = gamma_params.lut[bin_index];
+          uint16_t r_val = gamma_params.lut[bin_index + 1];
+
+          uint16_t bin_size = 1 << kGammaLogSegmentSpacing[segment_index];
+          // Rounded linear interpolation between the two LUT entries
+          uint32_t lerp_val = (l_val * (bin_size - offset_within_bin) +
+                               r_val * offset_within_bin + (bin_size >> 1)) >>
+                              kGammaLogSegmentSpacing[segment_index];
+
+          // Clamping is not required
+          // TODO(alexkaplan): The comment above is from gChips source.
+          // This calculation needs to be checked carefully.
+          //
+          out_line[c][x] = (pixel_type_t)lerp_val;
+        }
+      }
+    }
+  }
+}
diff --git a/samples/risp4ml/isp_stages/gamma.h b/samples/risp4ml/isp_stages/gamma.h
new file mode 100644
index 0000000..ad6d61e
--- /dev/null
+++ b/samples/risp4ml/isp_stages/gamma.h
@@ -0,0 +1,25 @@
+#ifndef SAMPLES_RISP4ML_ISP_STAGES_GAMMA_H_
+#define SAMPLES_RISP4ML_ISP_STAGES_GAMMA_H_
+
+#include "samples/risp4ml/common/image.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+#define kGammaNumberPoints 81
+
+typedef struct {
+  bool enable;  // when false, gamma_process copies pixels through unchanged
+  pixel_type_t lut[kGammaNumberPoints];  // knots interpolated by gamma_process
+} GammaParams;
+
+void set_gamma_params(GammaParams* params);
+// NOTE(review): presumably maps `input` through GammaParams.lut -- confirm.
+void gamma_process(Image* input, Image* output);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // SAMPLES_RISP4ML_ISP_STAGES_GAMMA_H_
diff --git a/samples/risp4ml/isp_stages/wbg.c b/samples/risp4ml/isp_stages/wbg.c
new file mode 100644
index 0000000..3e84fb2
--- /dev/null
+++ b/samples/risp4ml/isp_stages/wbg.c
@@ -0,0 +1,116 @@
+#include <assert.h>
+
+#include "samples/risp4ml/common/utils.h"
+#include "samples/risp4ml/isp_stages/wbg.h"
+
+#define kBayerColorChannels 4  // NOTE(review): not referenced in this file
+#define MAX(a, b) (((a) > (b)) ? (a) : (b))  // evaluates its arguments twice
+
+static const uint16_t kWbgFractional = kRawPipelineFraction;  // gain frac bits
+static const uint16_t kWbgUnityGain = 1 << kWbgFractional;  // gain of 1.0
+static WbgParams wbg_params = {
+    .enable = true,   // false makes wbg_process skip the gain multiplication
+    .fixed = false,   // false: gains are re-estimated from every input image
+    .gains = {kWbgUnityGain, kWbgUnityGain, kWbgUnityGain, kWbgUnityGain}};
+// Replaces the active white-balance settings with a copy of *params.
+void set_wbg_params(WbgParams* params) { wbg_params = *params; }
+
+// Mean of `count` samples summing to `sum`; 0 when the channel has no samples.
+static float channel_average(int64_t sum, uint32_t count) {
+  return count ? (float)(1.0 * sum / count) : 0.0f;
+}
+
+// Fixed-point gain that lifts `average` to `max_average`. A channel whose
+// average is not positive has no meaningful ratio (it would divide by zero),
+// so it gets unity gain instead.
+static uint32_t channel_gain(float max_average, float average) {
+  if (!(average > 0.0f)) {
+    return kWbgUnityGain;
+  }
+  return FloatToFixedPoint(max_average / average, kRawPipelineInteger,
+                           kRawPipelineFraction, /*bool is_signed*/ false);
+}
+
+// Calculates the white-balance gains of `input` using the "gray world"
+// algorithm and stores them in wbg_params.gains, indexed by BayerIndex.
+static void compute_wbg_gain(Image* input) {
+  uint16_t height = input->height;
+  uint16_t width = input->width;
+
+  int64_t sum_of_reds = 0;
+  uint32_t num_of_reds = 0;
+  // Gr and Gb are pooled into one green average, since the difference
+  // between the two green sensor pixels is negligible.
+  int64_t sum_of_greens = 0;
+  uint32_t num_of_greens = 0;
+  int64_t sum_of_blues = 0;
+  uint32_t num_of_blues = 0;
+
+  for (uint16_t y = 0; y < height; ++y) {
+    const pixel_type_t* in_line = image_row(input, 0, y);
+    for (uint16_t x = 0; x < width; ++x) {
+      switch (GetBayerIndex(kBayerType, x, y)) {
+        case kR:
+          sum_of_reds += in_line[x];
+          num_of_reds++;
+          break;
+        case kGr:
+        case kGb:
+          sum_of_greens += in_line[x];
+          num_of_greens++;
+          break;
+        case kB:
+          sum_of_blues += in_line[x];
+          num_of_blues++;
+          break;
+        default:
+          assert(0 && "Unexpected channel index");
+      }
+    }
+  }
+
+  // Scale every channel up to the brightest channel's average. Channels with
+  // no samples average to 0 and therefore end up with unity gain.
+  float average_red = channel_average(sum_of_reds, num_of_reds);
+  float average_green = channel_average(sum_of_greens, num_of_greens);
+  float average_blue = channel_average(sum_of_blues, num_of_blues);
+  float max_average = MAX(MAX(average_red, average_green), average_blue);
+
+  // Index by BayerIndex name so the table matches the lookup in wbg_process.
+  wbg_params.gains[kR] = channel_gain(max_average, average_red);
+  wbg_params.gains[kGr] = channel_gain(max_average, average_green);
+  wbg_params.gains[kGb] = wbg_params.gains[kGr];
+  wbg_params.gains[kB] = channel_gain(max_average, average_blue);
+}
+
+// Applies the per-channel white-balance gains to `input`, writing `output`.
+// When disabled, pixels are copied through so `output` keeps its own buffer.
+void wbg_process(Image* input, Image* output) {
+  uint16_t height = input->height;
+  uint16_t width = input->width;
+
+  // Gains are only re-estimated when enabled and not pinned by the caller.
+  if (wbg_params.enable && !wbg_params.fixed) {
+    compute_wbg_gain(input);
+  }
+
+  for (uint16_t y = 0; y < height; ++y) {
+    const pixel_type_t* in_line = image_row(input, 0, y);
+    pixel_type_t* out_line = image_row(output, 0, y);
+    for (uint16_t x = 0; x < width; ++x) {
+      if (!wbg_params.enable) {
+        // Same pass-through as the gamma stage: copy the data, not the struct.
+        out_line[x] = in_line[x];
+        continue;
+      }
+      BayerIndex bayer_index = GetBayerIndex(kBayerType, x, y);
+      uint32_t input_val = in_line[x];
+      // Fixed-point multiply, rounded to nearest by adding half an LSB.
+      uint32_t scaled_pixel = (input_val * wbg_params.gains[bayer_index] +
+                               (1 << (kWbgFractional - 1))) >>
+                              kWbgFractional;
+      out_line[x] = (pixel_type_t)Clamp(scaled_pixel, kRawPipelineMinVal,
+                                        kRawPipelineMaxVal);
+    }
+  }
+}
diff --git a/samples/risp4ml/isp_stages/wbg.h b/samples/risp4ml/isp_stages/wbg.h
new file mode 100644
index 0000000..2af1b84
--- /dev/null
+++ b/samples/risp4ml/isp_stages/wbg.h
@@ -0,0 +1,24 @@
+#ifndef SAMPLES_RISP4ML_ISP_STAGES_WBG_H_
+#define SAMPLES_RISP4ML_ISP_STAGES_WBG_H_
+
+#include "samples/risp4ml/common/image.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+typedef struct {
+  bool enable;  // false: wbg_process skips the gain multiplication
+  bool fixed;   // true: use `gains` as given; false: recompute per image
+  uint32_t gains[kNumBayerPatterns];  // fixed-point gain per BayerIndex
+} WbgParams;
+
+void set_wbg_params(WbgParams* params);  // copies *params into the stage
+// Scales each Bayer pixel of `input` by its channel gain into `output`.
+void wbg_process(Image* input, Image* output);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // SAMPLES_RISP4ML_ISP_STAGES_WBG_H_
diff --git a/samples/risp4ml/pipeline/CMakeLists.txt b/samples/risp4ml/pipeline/CMakeLists.txt
new file mode 100644
index 0000000..e6447c6
--- /dev/null
+++ b/samples/risp4ml/pipeline/CMakeLists.txt
@@ -0,0 +1,18 @@
+# ISP pipeline entry point (pipeline.c) together with every stage it chains.
+iree_cc_library(
+  NAME
+    pipeline
+  HDRS
+    "pipeline.h"
+  SRCS
+    "pipeline.c"
+  DEPS
+    samples::risp4ml::common::image
+    samples::risp4ml::common::utils
+    samples::risp4ml::isp_stages::blc
+    samples::risp4ml::isp_stages::demosaic
+    samples::risp4ml::isp_stages::dg
+    samples::risp4ml::isp_stages::downscale
+    samples::risp4ml::isp_stages::gamma
+    samples::risp4ml::isp_stages::wbg
+)
diff --git a/samples/risp4ml/pipeline/pipeline.c b/samples/risp4ml/pipeline/pipeline.c
new file mode 100644
index 0000000..ec561b2
--- /dev/null
+++ b/samples/risp4ml/pipeline/pipeline.c
@@ -0,0 +1,43 @@
+#include "samples/risp4ml/common/utils.h"
+#include "samples/risp4ml/isp_stages/blc.h"
+#include "samples/risp4ml/isp_stages/demosaic.h"
+#include "samples/risp4ml/isp_stages/dg.h"
+#include "samples/risp4ml/isp_stages/downscale.h"
+#include "samples/risp4ml/isp_stages/gamma.h"
+#include "samples/risp4ml/isp_stages/wbg.h"
+#include "samples/risp4ml/pipeline/pipeline.h"
+
+// Runs the ISP chain BLC -> DG -> WBG -> demosaic -> gamma -> downscale on the
+// 8-bit `input`, writing `output`; each intermediate is freed after its use.
+void isp_pipeline(ImageU8 *input, ImageU8 *output) {
+  const uint32_t total = input->num_channels * input->height * input->width;
+  // Shift the 8-bit input up to the 16-bit processing pipeline bit width.
+  Image *raw_img = image_new(input->num_channels, input->height, input->width);
+  for (uint32_t idx = 0; idx < total; ++idx) {
+    raw_img->data[idx] = input->data[idx] << kRawPipelineFraction;  // 8
+  }
+
+  Image *blc_img = image_new(input->num_channels, input->height, input->width);
+  blc_process(raw_img, blc_img);
+  image_delete(raw_img);
+
+  Image *dg_img = image_new(input->num_channels, input->height, input->width);
+  dg_process(blc_img, dg_img);
+  image_delete(blc_img);
+
+  Image *wbg_img = image_new(input->num_channels, input->height, input->width);
+  wbg_process(dg_img, wbg_img);
+  image_delete(dg_img);
+
+  Image *rgb_img = image_new(output->num_channels, input->height, input->width);
+  demosaic_process(wbg_img, rgb_img);
+  image_delete(wbg_img);
+
+  Image *gam_img = image_new(output->num_channels, input->height, input->width);
+  gamma_process(rgb_img, gam_img);
+  image_delete(rgb_img);
+
+  set_downscale_factor(gam_img, output);
+  downscale_process(gam_img, output);
+  image_delete(gam_img);
+}
diff --git a/samples/risp4ml/pipeline/pipeline.h b/samples/risp4ml/pipeline/pipeline.h
new file mode 100644
index 0000000..244a41b
--- /dev/null
+++ b/samples/risp4ml/pipeline/pipeline.h
@@ -0,0 +1,18 @@
+#ifndef SAMPLES_RISP4ML_PIPELINE_PIPELINE_H_
+#define SAMPLES_RISP4ML_PIPELINE_PIPELINE_H_
+
+#include "samples/risp4ml/common/image.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Runs the complete ISP chain on the 8-bit `input` image and writes the
+// processed result into the caller-provided `output` image.
+void isp_pipeline(ImageU8* input, ImageU8* output);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // SAMPLES_RISP4ML_PIPELINE_PIPELINE_H_
diff --git a/samples/risp4ml/test_data/faces_480x640_uint8_numpy_bayer.bin b/samples/risp4ml/test_data/faces_480x640_uint8_numpy_bayer.bin
new file mode 100644
index 0000000..3772000
--- /dev/null
+++ b/samples/risp4ml/test_data/faces_480x640_uint8_numpy_bayer.bin
Binary files differ