RISP4ML: further optimize memory allocation and usage

This change further optimizes memory allocation/usage for RISP4ML.

It makes the RISP4ML stages (BLC, DG, WBG and gamma) operate in place wherever possible, which minimizes the use of "malloc" in the pipeline. The unit tests are updated accordingly.

The change has been verified by comparing the output of the plain-C RISP4ML against the output produced in google3. All unit tests pass.

Change-Id: I93ec0c97f75c36be764bc65e4a886f725e0b2a9a
diff --git a/samples/risp4ml/isp_stages/blc.c b/samples/risp4ml/isp_stages/blc.c
index 26e36b5..61d8c47 100644
--- a/samples/risp4ml/isp_stages/blc.c
+++ b/samples/risp4ml/isp_stages/blc.c
@@ -6,23 +6,14 @@
 
 void set_blc_params(BlcParams* params) { blc_params = *params; }
 
-void blc_process(Image* input, Image* output) {
-  if (!blc_params.enable) {
-    *output = *input;
-    return;
-  }
+void blc_process(Image* img) {
+  if (!blc_params.enable) return;
 
-  uint16_t height = input->height;
-  uint16_t width = input->width;
-
-  for (uint16_t y = 0; y < height; ++y) {
-    const pixel_type_t* in_line = image_row(input, 0, y);
-    pixel_type_t* out_line = image_row(output, 0, y);
-
-    for (uint16_t x = 0; x < width; ++x) {
+  for (uint16_t y = 0; y < img->height; ++y) {
+    pixel_type_t* line = image_row(img, 0, y);
+    for (uint16_t x = 0; x < img->width; ++x) {
       BayerIndex bayer_index = GetBayerIndex(kBayerType, x, y);
-      out_line[x] =
-          SubUnsignedZeroClamp(in_line[x], blc_params.offsets[bayer_index]);
+      line[x] = SubUnsignedZeroClamp(line[x], blc_params.offsets[bayer_index]);
     }
   }
 }
diff --git a/samples/risp4ml/isp_stages/blc.h b/samples/risp4ml/isp_stages/blc.h
index 4ad1466..e4c9555 100644
--- a/samples/risp4ml/isp_stages/blc.h
+++ b/samples/risp4ml/isp_stages/blc.h
@@ -14,7 +14,7 @@
 
 void set_blc_params(BlcParams* params);
 
-void blc_process(Image* input, Image* output);
+void blc_process(Image* img);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/samples/risp4ml/isp_stages/blc_test.cc b/samples/risp4ml/isp_stages/blc_test.cc
index 1935048..ddd782b 100644
--- a/samples/risp4ml/isp_stages/blc_test.cc
+++ b/samples/risp4ml/isp_stages/blc_test.cc
@@ -10,11 +10,14 @@
  protected:
   void SetUp() override {
     in_ = image_new(1, kFrameSize, kFrameSize);
-    out_ = image_new(1, kFrameSize, kFrameSize);
     InitImageRandom(in_, 0, kRawPipelineMaxVal);
     // Force max/min values to be included.
     *image_pixel(in_, 0, 0, 0) = 0;
     *image_pixel(in_, 0, 0, 1) = kRawPipelineMaxVal;
+    out_ = image_new(1, kFrameSize, kFrameSize);
+    const uint32_t num_bytes =
+        in_->num_channels * in_->height * in_->width * sizeof(pixel_type_t);
+    memcpy(out_->data, in_->data, num_bytes);
   }
   void TearDown() override {
     image_delete(in_);
@@ -29,7 +32,7 @@
   BlcParams params = {.enable = false, .offsets = {20, 20, 20, 20}};
   set_blc_params(&params);
 
-  blc_process(in_, out_);
+  blc_process(out_);
 
   // Expect no change
   for (uint16_t c = 0; c < in_->num_channels; ++c) {
@@ -47,7 +50,7 @@
   BlcParams params = {.enable = true, .offsets = {0, 0, 0, 0}};
   set_blc_params(&params);
 
-  blc_process(in_, out_);
+  blc_process(out_);
 
   // Expect no change
   for (uint16_t c = 0; c < in_->num_channels; ++c) {
@@ -67,7 +70,7 @@
                                   kRawPipelineMaxVal, kRawPipelineMaxVal}};
   set_blc_params(&params);
 
-  blc_process(in_, out_);
+  blc_process(out_);
 
   // Expect correct subtraction
   for (uint16_t c = 0; c < in_->num_channels; ++c) {
diff --git a/samples/risp4ml/isp_stages/dg.c b/samples/risp4ml/isp_stages/dg.c
index 73f7f46..fbb66b8 100644
--- a/samples/risp4ml/isp_stages/dg.c
+++ b/samples/risp4ml/isp_stages/dg.c
@@ -9,28 +9,20 @@
 
 void set_dg_params(DgParams* params) { dg_params = *params; }
 
-void dg_process(Image* input, Image* output) {
-  if (!dg_params.enable) {
-    *output = *input;
-    return;
-  }
+void dg_process(Image* img) {
+  if (!dg_params.enable) return;
 
-  uint16_t height = input->height;
-  uint16_t width = input->width;
+  for (uint16_t y = 0; y < img->height; ++y) {
+    pixel_type_t* line = image_row(img, 0, y);
 
-  for (uint16_t y = 0; y < height; ++y) {
-    const pixel_type_t* in_line = image_row(input, 0, y);
-    pixel_type_t* out_line = image_row(output, 0, y);
-
-    for (uint16_t x = 0; x < width; ++x) {
+    for (uint16_t x = 0; x < img->width; ++x) {
       BayerIndex bayer_index = GetBayerIndex(kBayerType, x, y);
-      uint16_t input_val = in_line[x];
       // + (1 << (kDgFractional -1)) adds 0.5 for more accurate rounding
-      uint32_t scaled_pixel =
-          input_val * dg_params.gains[bayer_index] + (1 << (kDgFractional - 1));
+      uint32_t scaled_pixel = (uint32_t)line[x] * dg_params.gains[bayer_index] +
+                              (1 << (kDgFractional - 1));
 
-      out_line[x] = (pixel_type_t)Clamp(scaled_pixel >> kDgFractional, 0,
-                                        kRawPipelineMaxVal);
+      line[x] = (pixel_type_t)Clamp(scaled_pixel >> kDgFractional, 0,
+                                    kRawPipelineMaxVal);
     }
   }
 }
diff --git a/samples/risp4ml/isp_stages/dg.h b/samples/risp4ml/isp_stages/dg.h
index 1a0bc1b..5979019 100644
--- a/samples/risp4ml/isp_stages/dg.h
+++ b/samples/risp4ml/isp_stages/dg.h
@@ -14,7 +14,7 @@
 
 void set_dg_params(DgParams* params);
 
-void dg_process(Image* input, Image* output);
+void dg_process(Image* img);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/samples/risp4ml/isp_stages/dg_test.cc b/samples/risp4ml/isp_stages/dg_test.cc
index cf3b7dc..41bf18e 100644
--- a/samples/risp4ml/isp_stages/dg_test.cc
+++ b/samples/risp4ml/isp_stages/dg_test.cc
@@ -12,6 +12,8 @@
   void SetUp() override {
     in_ = image_new(1, kFrameSize, kFrameSize);
     out_ = image_new(1, kFrameSize, kFrameSize);
+    num_bytes_ =
+        in_->num_channels * in_->height * in_->width * sizeof(pixel_type_t);
   }
   void TearDown() override {
     image_delete(in_);
@@ -20,17 +22,19 @@
 
   Image* in_;
   Image* out_;
+  uint32_t num_bytes_;
 };
 
 TEST_F(DgTest, Bypass) {
   InitImage(in_, 1);
+  memcpy(out_->data, in_->data, num_bytes_);
 
   // Set gain to 2x, gain is in 8.16 format.
   uint16_t gain = 2 << kDgFractional;
   DgParams params = {.enable = false, .gains = {gain, gain, gain, gain}};
   set_dg_params(&params);
 
-  dg_process(in_, out_);
+  dg_process(out_);
 
   // Expect no change.
   for (uint16_t c = 0; c < in_->num_channels; ++c) {
@@ -49,13 +53,14 @@
   // Force max/min values to be included.
   *image_pixel(in_, 0, 0, 0) = 0;
   *image_pixel(in_, 0, 0, 1) = kRawPipelineMaxVal;
+  memcpy(out_->data, in_->data, num_bytes_);
 
   // Set gain to 1x, gain is in_ 8.8 format.
   uint16_t gain = 1 << kDgFractional;
   DgParams params = {.enable = true, .gains = {gain, gain, gain, gain}};
   set_dg_params(&params);
 
-  dg_process(in_, out_);
+  dg_process(out_);
 
   // Expect no change.
   for (uint16_t c = 0; c < in_->num_channels; ++c) {
@@ -75,13 +80,14 @@
   // Force max/min values to be included.
   *image_pixel(in_, 0, 0, 0) = kRawPipelineMinVal / 2;
   *image_pixel(in_, 0, 0, 1) = kRawPipelineMaxVal / 2;
+  memcpy(out_->data, in_->data, num_bytes_);
 
   // Set gain to 2x, gain is in_ 8.16 format.
   uint16_t gain = 2 << kDgFractional;
   DgParams params = {.enable = true, .gains = {gain, gain, gain, gain}};
   set_dg_params(&params);
 
-  dg_process(in_, out_);
+  dg_process(out_);
 
   // Expect all pixel values to be doubled, as no clipping.
   for (uint16_t c = 0; c < in_->num_channels; ++c) {
@@ -97,13 +103,14 @@
 TEST_F(DgTest, ClampHighRandomPixel) {
   // Init image with range of values that will clamp to max with 2x gain.
   InitImageRandom(in_, kRawPipelineMaxVal / 2, kRawPipelineMaxVal);
+  memcpy(out_->data, in_->data, num_bytes_);
 
   // Set gain to 2x, gain is in_ 8.16 format.
   uint16_t gain = 2 << kDgFractional;
   DgParams params = {.enable = true, .gains = {gain, gain, gain, gain}};
   set_dg_params(&params);
 
-  dg_process(in_, out_);
+  dg_process(out_);
 
   // Expect all pixel values to be clamped high.
   for (uint16_t c = 0; c < in_->num_channels; ++c) {
@@ -118,13 +125,14 @@
 TEST_F(DgTest, ClampLowRandomPixel) {
   // Init image with range of values that will clamp to min with 2x gain.
   InitImageRandom(in_, kRawPipelineMinVal / 2, kRawPipelineMinVal);
+  memcpy(out_->data, in_->data, num_bytes_);
 
   // Set gain to 2x, gain is in_ 8.8 format.
   uint16_t gain = 2 << kDgFractional;
   DgParams params = {.enable = true, .gains = {gain, gain, gain, gain}};
   set_dg_params(&params);
 
-  dg_process(in_, out_);
+  dg_process(out_);
 
   // Expect all pixel values to be clamped low.
   for (uint16_t c = 0; c < in_->num_channels; ++c) {
@@ -147,6 +155,7 @@
       }
     }
   }
+  memcpy(out_->data, in_->data, num_bytes_);
 
   // Set gain to max gain, 0xFFFF in_ 8.8 format, approximately 256.
   constexpr uint32_t kMaxGain =
@@ -155,7 +164,7 @@
                      .gains = {kMaxGain, kMaxGain, kMaxGain, kMaxGain}};
   set_dg_params(&params);
 
-  dg_process(in_, out_);
+  dg_process(out_);
 
   // Expect values 1, 2, ... 64 to be gained by 256, 128 and above to clip.
   for (uint16_t c = 0; c < in_->num_channels; ++c) {
@@ -179,12 +188,13 @@
       }
     }
   }
+  memcpy(out_->data, in_->data, num_bytes_);
 
   // Set gain to min non-zero gain, 0x000001 in_ 8.8 format, 1/256.
   DgParams params = {.enable = true, .gains = {1, 1, 1, 1}};
   set_dg_params(&params);
 
-  dg_process(in_, out_);
+  dg_process(out_);
 
   constexpr uint32_t kExpectedOutput[] = {0, 0, 0, 0, 0,  0,  0,  1,
                                           1, 2, 4, 8, 16, 32, 64, 128};
@@ -210,12 +220,13 @@
       }
     }
   }
+  memcpy(out_->data, in_->data, num_bytes_);
 
   // Set gain to min useful gain, 0x000002 in_ 8.8 format, 1/256.
   DgParams params = {.enable = true, .gains = {2, 2, 2, 2}};
   set_dg_params(&params);
 
-  dg_process(in_, out_);
+  dg_process(out_);
 
   // Expect all output values < 128 to be zero, 128->1, 256->2.
   constexpr uint32_t kExpectedOutput[] = {0, 0, 0, 0,  0,  0,  1,   1,
@@ -232,12 +243,13 @@
 TEST_F(DgTest, ZeroGainRandomInput) {
   // Init image with range of values.
   InitImageRandom(in_, kRawPipelineMinVal, kRawPipelineMinVal);
+  memcpy(out_->data, in_->data, num_bytes_);
 
   // Set gain to 0x, gain is in_ 8.16 format.
   DgParams params = {.enable = true, .gains = {0, 0, 0, 0}};
   set_dg_params(&params);
 
-  dg_process(in_, out_);
+  dg_process(out_);
 
   // Expect all output values to be zero.
   for (uint16_t c = 0; c < in_->num_channels; ++c) {
diff --git a/samples/risp4ml/isp_stages/gamma.c b/samples/risp4ml/isp_stages/gamma.c
index f0e9f51..81cde21 100644
--- a/samples/risp4ml/isp_stages/gamma.c
+++ b/samples/risp4ml/isp_stages/gamma.c
@@ -27,60 +27,53 @@
 
 void set_gamma_params(GammaParams* params) { gamma_params = *params; }
 
-void gamma_process(Image* input, Image* output) {
-  uint16_t height = input->height;
-  uint16_t width = input->width;
+void gamma_process(Image* img) {
+  if (!gamma_params.enable) return;
 
-  const pixel_type_t* in_line[kRgbColorChannels];
-  pixel_type_t* out_line[kRgbColorChannels];
+  pixel_type_t* line[kRgbColorChannels];
 
-  for (uint16_t y = 0; y < height; ++y) {
+  for (uint16_t y = 0; y < img->height; ++y) {
     for (uint16_t c = 0; c < kRgbColorChannels; ++c) {
-      in_line[c] = image_row(input, c, y);
-      out_line[c] = image_row(output, c, y);
+      line[c] = image_row(img, c, y);
     }
 
-    for (uint16_t x = 0; x < width; ++x) {
+    for (uint16_t x = 0; x < img->width; ++x) {
       for (uint16_t c = 0; c < kRgbColorChannels; ++c) {
-        if (!gamma_params.enable) {
-          out_line[c][x] = in_line[c][x];
-        } else {
-          pixel_type_t pixel_val =
-              (pixel_type_t)Clamp(in_line[c][x], 0, kRgbPipelineMaxVal);
+        pixel_type_t pixel_val =
+            (pixel_type_t)Clamp(line[c][x], 0, kRgbPipelineMaxVal);
 
-          // Determine segment
-          int segment_index =
-              (kGammaNumberSegments - 1) -
-              ClzMsb(pixel_val, kRgbPipelineBpp, kGammaNumberSegments - 1);
-          uint16_t segment_left =
-              segment_index ? 1 << (kRgbPipelineBpp -
-                                    kGammaLogSegmentOffsets[segment_index])
-                            : 0;
+        // Determine segment
+        int segment_index =
+            (kGammaNumberSegments - 1) -
+            ClzMsb(pixel_val, kRgbPipelineBpp, kGammaNumberSegments - 1);
+        uint16_t segment_left =
+            segment_index ? 1 << (kRgbPipelineBpp -
+                                  kGammaLogSegmentOffsets[segment_index])
+                          : 0;
 
-          // Bin index
-          int bin_index = ((pixel_val - segment_left) >>
-                           kGammaLogSegmentSpacing[segment_index]) +
-                          kGammaSegmentLutOffset[segment_index];
+        // Bin index
+        int bin_index = ((pixel_val - segment_left) >>
+                         kGammaLogSegmentSpacing[segment_index]) +
+                        kGammaSegmentLutOffset[segment_index];
 
-          int offset_within_bin =
-              (pixel_val - segment_left) &
-              ((1 << kGammaLogSegmentSpacing[segment_index]) - 1);
+        int offset_within_bin =
+            (pixel_val - segment_left) &
+            ((1 << kGammaLogSegmentSpacing[segment_index]) - 1);
 
-          uint16_t l_val = gamma_params.lut[bin_index];
-          uint16_t r_val = gamma_params.lut[bin_index + 1];
+        uint16_t l_val = gamma_params.lut[bin_index];
+        uint16_t r_val = gamma_params.lut[bin_index + 1];
 
-          uint16_t bin_size = 1 << kGammaLogSegmentSpacing[segment_index];
+        uint16_t bin_size = 1 << kGammaLogSegmentSpacing[segment_index];
 
-          uint32_t lerp_val = (l_val * (bin_size - offset_within_bin) +
-                               r_val * offset_within_bin + (bin_size >> 1)) >>
-                              kGammaLogSegmentSpacing[segment_index];
+        uint32_t lerp_val = (l_val * (bin_size - offset_within_bin) +
+                             r_val * offset_within_bin + (bin_size >> 1)) >>
+                            kGammaLogSegmentSpacing[segment_index];
 
-          // Clamping is not requied
-          // TODO(alexkaplan): The comment above is from gChips source.
-          // this calc needs to be checked carefully:
-          //
-          out_line[c][x] = (pixel_type_t)lerp_val;
-        }
+        // Clamping is not required
+        // TODO(alexkaplan): The comment above is from gChips source.
+        // this calc needs to be checked carefully:
+        //
+        line[c][x] = (pixel_type_t)lerp_val;
       }
     }
   }
diff --git a/samples/risp4ml/isp_stages/gamma.h b/samples/risp4ml/isp_stages/gamma.h
index ad6d61e..4ef3777 100644
--- a/samples/risp4ml/isp_stages/gamma.h
+++ b/samples/risp4ml/isp_stages/gamma.h
@@ -16,7 +16,7 @@
 
 void set_gamma_params(GammaParams* params);
 
-void gamma_process(Image* input, Image* output);
+void gamma_process(Image* img);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/samples/risp4ml/isp_stages/gamma_test.cc b/samples/risp4ml/isp_stages/gamma_test.cc
index 9adc32c..cbf4a94 100644
--- a/samples/risp4ml/isp_stages/gamma_test.cc
+++ b/samples/risp4ml/isp_stages/gamma_test.cc
@@ -33,7 +33,6 @@
  protected:
   void setup(uint16_t width) {
     in_ = image_new(3, 2, width);
-    out_ = image_new(3, 2, width);
     for (uint16_t c = 0; c < in_->num_channels; ++c) {
       for (uint16_t y = 0; y < in_->height; ++y) {
         for (uint16_t x = 0; x < in_->width; ++x) {
@@ -41,6 +40,10 @@
         }
       }
     }
+    out_ = image_new(3, 2, width);
+    const uint32_t num_bytes =
+        in_->num_channels * in_->height * in_->width * sizeof(pixel_type_t);
+    memcpy(out_->data, in_->data, num_bytes);
   }
   void TearDown() override {
     image_delete(in_);
@@ -63,7 +66,7 @@
 
   set_gamma_params(&params);
 
-  gamma_process(in_, out_);
+  gamma_process(out_);
 
   for (uint16_t c = 0; c < in_->num_channels; ++c) {
     for (uint16_t y = 0; y < in_->height; ++y) {
@@ -81,7 +84,7 @@
 
   set_gamma_params(&linear_params);
 
-  gamma_process(in_, out_);
+  gamma_process(out_);
 
   for (uint16_t c = 0; c < in_->num_channels; ++c) {
     for (uint16_t y = 0; y < in_->height; ++y) {
@@ -100,7 +103,7 @@
 
   set_gamma_params(&rgb_params);
 
-  gamma_process(in_, out_);
+  gamma_process(out_);
 
   for (uint16_t c = 0; c < in_->num_channels; ++c) {
     for (uint16_t y = 0; y < in_->height; ++y) {
diff --git a/samples/risp4ml/isp_stages/wbg.c b/samples/risp4ml/isp_stages/wbg.c
index 3e84fb2..f38135a 100644
--- a/samples/risp4ml/isp_stages/wbg.c
+++ b/samples/risp4ml/isp_stages/wbg.c
@@ -15,13 +15,8 @@
 
 void set_wbg_params(WbgParams* params) { wbg_params = *params; }
 
-static void compute_wbg_gain(Image* input) {
+static void compute_wbg_gain(Image* img) {
   // Calculate the white-balance gain values using the "gray world" algorithm
-  uint16_t height = input->height;
-  uint16_t width = input->width;
-
-  pixel_type_t* in_line;
-
   int64_t sum_of_reds = 0;
   uint32_t num_of_reds = 0;
   // will use only one of the greens for scaling, since the difference between
@@ -31,25 +26,25 @@
   int64_t sum_of_blues = 0;
   uint32_t num_of_blues = 0;
 
-  for (uint16_t y = 0; y < height; ++y) {
-    in_line = image_row(input, 0, y);
-    for (uint16_t x = 0; x < width; ++x) {
+  for (uint16_t y = 0; y < img->height; ++y) {
+    pixel_type_t* line = image_row(img, 0, y);
+    for (uint16_t x = 0; x < img->width; ++x) {
       BayerIndex bayer_index = GetBayerIndex(kBayerType, x, y);
       switch (bayer_index) {
         case (kR): {
-          sum_of_reds += in_line[x];
+          sum_of_reds += line[x];
           num_of_reds++;
         }; break;
         case (kGr): {
-          sum_of_greens += in_line[x];
+          sum_of_greens += line[x];
           num_of_greens++;
         }; break;
         case (kGb): {
-          sum_of_greens += in_line[x];
+          sum_of_greens += line[x];
           num_of_greens++;
         }; break;
         case (kB): {
-          sum_of_blues += in_line[x];
+          sum_of_blues += line[x];
           num_of_blues++;
         }; break;
         default: {
@@ -83,34 +78,23 @@
   wbg_params.gains[3] = blue_wb;
 }
 
-void wbg_process(Image* input, Image* output) {
-  if (!wbg_params.enable) {
-    *output = *input;
-    return;
-  }
-
-  uint16_t height = input->height;
-  uint16_t width = input->width;
-
-  const pixel_type_t* in_line;
-  pixel_type_t* out_line;
-
+void wbg_process(Image* img) {
+  if (!wbg_params.enable) return;
   if (!wbg_params.fixed) {
-    compute_wbg_gain(input);
+    compute_wbg_gain(img);
   }
 
-  for (uint16_t y = 0; y < height; ++y) {
-    in_line = image_row(input, 0, y);
-    out_line = image_row(output, 0, y);
+  for (uint16_t y = 0; y < img->height; ++y) {
+    pixel_type_t* line = image_row(img, 0, y);
 
-    for (uint16_t x = 0; x < width; ++x) {
+    for (uint16_t x = 0; x < img->width; ++x) {
       BayerIndex bayer_index = GetBayerIndex(kBayerType, x, y);
-      uint32_t input_val = in_line[x];
+      uint32_t input_val = (uint32_t)line[x];
       uint32_t scaled_pixel = (input_val * wbg_params.gains[bayer_index] +
                                (1 << (kWbgFractional - 1))) >>
                               kWbgFractional;
-      out_line[x] = (pixel_type_t)Clamp(scaled_pixel, kRawPipelineMinVal,
-                                        kRawPipelineMaxVal);
+      line[x] = (pixel_type_t)Clamp(scaled_pixel, kRawPipelineMinVal,
+                                    kRawPipelineMaxVal);
     }
   }
 }
diff --git a/samples/risp4ml/isp_stages/wbg.h b/samples/risp4ml/isp_stages/wbg.h
index 2af1b84..8c8d9b4 100644
--- a/samples/risp4ml/isp_stages/wbg.h
+++ b/samples/risp4ml/isp_stages/wbg.h
@@ -15,7 +15,7 @@
 
 void set_wbg_params(WbgParams* params);
 
-void wbg_process(Image* input, Image* output);
+void wbg_process(Image* img);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/samples/risp4ml/isp_stages/wbg_test.cc b/samples/risp4ml/isp_stages/wbg_test.cc
index 012a5c7..52695a0 100644
--- a/samples/risp4ml/isp_stages/wbg_test.cc
+++ b/samples/risp4ml/isp_stages/wbg_test.cc
@@ -13,6 +13,8 @@
   void SetUp() override {
     in_ = image_new(1, kFrameHeight, kFrameWidth);
     out_ = image_new(1, kFrameHeight, kFrameWidth);
+    num_bytes_ =
+        in_->num_channels * in_->height * in_->width * sizeof(pixel_type_t);
   }
   void TearDown() override {
     image_delete(in_);
@@ -21,11 +23,13 @@
 
   Image* in_;
   Image* out_;
+  uint32_t num_bytes_;
 };
 
 TEST_F(WbgTest, IdentityTest) {
   // Use a grey input image.
   InitImageRandom(in_, kRawPipelineMinVal, kRawPipelineMaxVal);
+  memcpy(out_->data, in_->data, num_bytes_);
 
   // set the params to something boring.
   uint32_t gain = 1 << kWbgFractional;
@@ -33,7 +37,7 @@
       .enable = true, .fixed = true, .gains = {gain, gain, gain, gain}};
   set_wbg_params(&params);
 
-  wbg_process(in_, out_);
+  wbg_process(out_);
   for (uint16_t y = 0; y < kFrameHeight; y++) {
     for (uint16_t x = 0; x < kFrameWidth; x++) {
       ASSERT_EQ(image_pixel_val(in_, 0, y, x), image_pixel_val(out_, 0, y, x));
@@ -45,6 +49,7 @@
   // Use a grey input image.
   constexpr pixel_type_t kPixelVal = 1 << kWbgFractional;
   InitImage(in_, kPixelVal);
+  memcpy(out_->data, in_->data, num_bytes_);
 
   // set the params to something boring.
   WbgParams params = {.enable = true,
@@ -52,7 +57,7 @@
                       .gains = {kPixelVal, kPixelVal, kPixelVal, kPixelVal}};
   set_wbg_params(&params);
 
-  wbg_process(in_, out_);
+  wbg_process(out_);
 
   for (uint16_t c1 = 0; c1 < 2; ++c1) {
     for (uint16_t c2 = 0; c2 < 2; ++c2) {
@@ -70,13 +75,14 @@
   // Use a grey input image.
   constexpr pixel_type_t kPixelValHi = kRawPipelineMaxVal - 10;
   InitImage(in_, kPixelValHi);
+  memcpy(out_->data, in_->data, num_bytes_);
 
   uint32_t gain = 2 * (1 << kWbgFractional);
   WbgParams params = {
       .enable = true, .fixed = true, .gains = {gain, gain, gain, gain}};
   set_wbg_params(&params);
 
-  wbg_process(in_, out_);
+  wbg_process(out_);
 
   for (uint16_t y = 0; y < kFrameHeight; y++) {
     for (uint16_t x = 0; x < kFrameWidth; x++) {
diff --git a/samples/risp4ml/pipeline/pipeline.c b/samples/risp4ml/pipeline/pipeline.c
index ec561b2..bca482d 100644
--- a/samples/risp4ml/pipeline/pipeline.c
+++ b/samples/risp4ml/pipeline/pipeline.c
@@ -8,36 +8,35 @@
 #include "samples/risp4ml/pipeline/pipeline.h"
 
 void isp_pipeline(ImageU8 *input, ImageU8 *output) {
-  Image *input_image =
+  Image *img_bayer =
       image_new(input->num_channels, input->height, input->width);
   // shift the 8bits wide input to 16bits (the processing pipeline bitwidth)
-  uint32_t input_dimensions =
+  const uint32_t input_dimensions =
       input->num_channels * input->height * input->width;
   for (uint32_t i = 0; i < input_dimensions; ++i) {
-    input_image->data[i] = input->data[i] << kRawPipelineFraction;  // 8
+    img_bayer->data[i] = input->data[i] << kRawPipelineFraction;  // 8
   }
 
-  Image *image1 = image_new(input->num_channels, input->height, input->width);
-  blc_process(input_image, image1);
-  image_delete(input_image);
+  // black level offset (in-place)
+  blc_process(img_bayer);
 
-  Image *image2 = image_new(input->num_channels, input->height, input->width);
-  dg_process(image1, image2);
-  image_delete(image1);
+  // digital gain (in-place)
+  dg_process(img_bayer);
 
-  Image *image3 = image_new(input->num_channels, input->height, input->width);
-  wbg_process(image2, image3);
-  image_delete(image2);
+  // white balance gain (in-place)
+  wbg_process(img_bayer);
 
-  Image *image4 = image_new(output->num_channels, input->height, input->width);
-  demosaic_process(image3, image4);
-  image_delete(image3);
+  // demosaic
+  Image *img_color =
+      image_new(output->num_channels, input->height, input->width);
+  demosaic_process(img_bayer, img_color);
+  image_delete(img_bayer);
 
-  Image *image5 = image_new(output->num_channels, input->height, input->width);
-  gamma_process(image4, image5);
-  image_delete(image4);
+  // gamma correction (in-place)
+  gamma_process(img_color);
 
-  set_downscale_factor(image5, output);
-  downscale_process(image5, output);
-  image_delete(image5);
+  // downscaler
+  set_downscale_factor(img_color, output);
+  downscale_process(img_color, output);
+  image_delete(img_color);
 }