Add computer vision tests

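Covers the diff, downsample, extrema, gaussian, shift_gaussian, and
upsample kernels under tests/cv, each checked against a scalar
reference. Also tags ISS tests with "iss" and SystemC tests with
"systemc" in the kelvin_test macro.
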
Change-Id: Id06ec3d8dcb0830d20bcd78962d089996f83acb6
diff --git a/build_tools/bazel/kelvin.bzl b/build_tools/bazel/kelvin.bzl
index bdbe612..03c1756 100644
--- a/build_tools/bazel/kelvin.bzl
+++ b/build_tools/bazel/kelvin.bzl
@@ -195,6 +195,7 @@
         hw_test_size = "medium",
         hw_test_tags = [],
         iss_test_size = "small",
+        iss_test_tags = [],
         **kwargs):
     """A sh_test wrapper for kelvin binaries
 
@@ -203,9 +204,18 @@
 
     Args:
       name: The name of this rule.
-      iss_test_size: ISS test size. Default to small.
       hw_test_size: Test size for the SystemC test. Defaults to medium.
       hw_test_tags: Test tags passed to the SystemC test.
+      iss_test_size: ISS test size. Defaults to small.
+      iss_test_tags: Test tags passed to the ISS test.
       **kwargs: Arguments that will be forwarded to kelvin_binary.
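+
+      Example (target name and tag value are illustrative):
+
+        kelvin_test(
+            name = "foo_test",
+            srcs = ["foo_test.cc"],
+            iss_test_tags = ["nightly"],
+        )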
     """
 
@@ -228,6 +230,7 @@
         data = [
             "{}.elf".format(kelvin_elf),
         ],
+        tags = ["iss"] + iss_test_tags,
     )
 
     hw_test = "{}_hw".format(name)
@@ -241,7 +244,7 @@
         data = [
             "{}.bin".format(kelvin_elf),
         ],
-        tags = hw_test_tags,
+        tags = ["systemc"] + hw_test_tags,
     )
 
     native.test_suite(
diff --git a/tests/cv/BUILD b/tests/cv/BUILD
new file mode 100644
index 0000000..5b8652e
--- /dev/null
+++ b/tests/cv/BUILD
@@ -0,0 +1,105 @@
+# Copyright 2023 Google LLC
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+# Computer vision examples.
+
+load("//build_tools/bazel:kelvin.bzl", "kelvin_test")
+
+cc_library(
+    name = "test_helper",
+    hdrs = [
+        "test_helper.h",
+    ],
+)
+
+kelvin_test(
+    name = "diff_test",
+    srcs = [
+        "diff.cc",
+        "diff_test.cc",
+    ],
+    hdrs = [
+        "diff.h",
+    ],
+    deps = [
+        ":test_helper",
+        "//crt:crt_header",
+    ],
+)
+
+kelvin_test(
+    name = "downsample_test",
+    srcs = [
+        "downsample.cc",
+        "downsample_test.cc",
+    ],
+    hdrs = [
+        "downsample.h",
+    ],
+    hw_test_size = "small",
+    deps = [
+        ":test_helper",
+        "//crt:crt_header",
+    ],
+)
+
+kelvin_test(
+    name = "extrema_test",
+    srcs = [
+        "extrema.cc",
+        "extrema_test.cc",
+    ],
+    hdrs = [
+        "extrema.h",
+    ],
+    deps = [
+        ":test_helper",
+        "//crt:crt_header",
+    ],
+)
+
+kelvin_test(
+    name = "gaussian_test",
+    srcs = [
+        "gaussian.cc",
+        "gaussian_test.cc",
+    ],
+    hdrs = [
+        "gaussian.h",
+    ],
+    deps = [
+        ":test_helper",
+        "//crt:crt_header",
+    ],
+)
+
+kelvin_test(
+    name = "shift_gaussian_test",
+    srcs = [
+        "shift_gaussian.cc",
+        "shift_gaussian_test.cc",
+    ],
+    hdrs = [
+        "shift_gaussian.h",
+    ],
+    deps = [
+        ":test_helper",
+        "//crt:crt_header",
+    ],
+)
+
+kelvin_test(
+    name = "upsample_test",
+    srcs = [
+        "upsample.cc",
+        "upsample_test.cc",
+    ],
+    hdrs = [
+        "upsample.h",
+    ],
+    hw_test_size = "small",
+    deps = [
+        ":test_helper",
+        "//crt:crt_header",
+    ],
+)
diff --git a/tests/cv/diff.cc b/tests/cv/diff.cc
new file mode 100644
index 0000000..db39296
--- /dev/null
+++ b/tests/cv/diff.cc
@@ -0,0 +1,120 @@
+// Copyright 2023 Google LLC
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#include "tests/cv/diff.h"
+
+#include <cstdint>
+
+#include "crt/kelvin.h"
+
+#define likely(x) __builtin_expect(!!(x), 1)
+#define unlikely(x) __builtin_expect(!!(x), 0)
+
+namespace kelvin::cv {
+
+void diff(int num_cols, const uint16_t* input0_row, const uint16_t* input1_row,
+          uint16_t* output_row) {
+  int vl;
+  int n = num_cols;
+  do {
+    getvl_h_x_m(vl, n);
+    n -= vl;
+    vld_h_lp_xx_m(vm1, input0_row, vl);
+    vld_h_lp_xx_m(vm2, input1_row, vl);
+    vsub_h_vv_m(vm0, vm1, vm2);
+    vst_h_lp_xx_m(vm0, output_row, vl);
+  } while (n > 0);
+}
+
+void diffp(int num_cols, const uint16_t* input0_row, const uint16_t* input1_row,
+           uint16_t* output_row) {
+  int vl_0, vl_1;
+  int n = num_cols;
+
+  // [0] load
+  getvl_h_x_m(vl_0, n);
+  n -= vl_0;
+  vld_h_lp_xx_m(v4, input0_row, vl_0);
+  vld_h_lp_xx_m(v8, input1_row, vl_0);
+
+  while (true) {
+    // [1] load
+    getvl_h_x_m(vl_1, n);
+    n -= vl_1;
+    vld_h_lp_xx_m(v20, input0_row, vl_1);
+    vld_h_lp_xx_m(v24, input1_row, vl_1);
+
+    // [0] store
+    vsub_h_vv_m(v0, v4, v8);
+    vst_h_lp_xx_m(v0, output_row, vl_0);
+    if (unlikely(!vl_1)) break;
+
+    // [0] load
+    getvl_h_x_m(vl_0, n);
+    n -= vl_0;
+    vld_h_lp_xx_m(v4, input0_row, vl_0);
+    vld_h_lp_xx_m(v8, input1_row, vl_0);
+
+    // [1] store
+    vsub_h_vv_m(v16, v20, v24);
+    vst_h_lp_xx_m(v16, output_row, vl_1);
+    if (unlikely(!vl_0)) break;
+  }
+}
+
+void diff4(int num_cols, int stride, const uint16_t* input0_row,
+           const uint16_t* input1_row, uint16_t* output_row) {
+  int vl;
+  int n = num_cols;
+  do {
+    getvl_h_x(vl, n);
+    n -= vl;
+    vld_h_tp_xx_m(v4, input0_row, stride);
+    vld_h_tp_xx_m(v8, input1_row, stride);
+    vsub_h_vv_m(v0, v4, v8);
+    vst_h_tp_xx_m(v0, output_row, stride);
+  } while (n > 0);
+}
+
+void diff4p(int num_cols, int stride, const uint16_t* input0_row,
+            const uint16_t* input1_row, uint16_t* output_row) {
+  int vl_0, vl_1;
+  int n = num_cols;
+
+  // [0] load
+  getvl_h_x(vl_0, n);
+  n -= vl_0;
+  vld_h_tp_xx_m(v4, input0_row, stride);
+  vld_h_tp_xx_m(v8, input1_row, stride);
+
+  while (true) {
+    // [1] load
+    getvl_h_x(vl_1, n);
+    n -= vl_1;
+    if (likely(vl_1)) {
+      vld_h_tp_xx_m(v20, input0_row, stride);
+      vld_h_tp_xx_m(v24, input1_row, stride);
+    }
+
+    // [0] store
+    vsub_h_vv_m(v0, v4, v8);
+    vst_h_tp_xx_m(v0, output_row, stride);
+    if (unlikely(!vl_1)) break;
+
+    // [0] load
+    getvl_h_x(vl_0, n);
+    n -= vl_0;
+    if (likely(vl_0)) {
+      vld_h_tp_xx_m(v4, input0_row, stride);
+      vld_h_tp_xx_m(v8, input1_row, stride);
+    }
+
+    // [1] store
+    vsub_h_vv_m(v16, v20, v24);
+    vst_h_tp_xx_m(v16, output_row, stride);
+    if (unlikely(!vl_0)) break;
+  }
+}
+
+};  // namespace kelvin::cv
diff --git a/tests/cv/diff.h b/tests/cv/diff.h
new file mode 100644
index 0000000..19a9c1b
--- /dev/null
+++ b/tests/cv/diff.h
@@ -0,0 +1,37 @@
+// Copyright 2023 Google LLC
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef TESTS_CV_DIFF_H_
+#define TESTS_CV_DIFF_H_
+
+#include <cstdint>
+
+namespace kelvin::cv {
+
+// DUT: diff.cc
+// REF: diff_test.cc
+
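+// All four variants compute the same elementwise uint16_t difference,
+//   output_row[i] = input0_row[i] - input1_row[i],
+// and differ only in traversal order and software pipelining.
+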
+// Stripmine horizontally one line.
+void diff(int num_cols, const uint16_t* input0_row, const uint16_t* input1_row,
+          uint16_t* output_row);
+
+// Stripmine horizontally one line with stage pipelining.
+void diffp(int num_cols, const uint16_t* input0_row, const uint16_t* input1_row,
+           uint16_t* output_row);
+
+// Stripmine vertically four lines.
+void diff4(int num_cols, int stride, const uint16_t* input0_row,
+           const uint16_t* input1_row, uint16_t* output_row);
+
+// Stripmine vertically four lines with stage pipelining.
+void diff4p(int num_cols, int stride, const uint16_t* input0_row,
+            const uint16_t* input1_row, uint16_t* output_row);
+
+};  // namespace kelvin::cv
+
+#endif  // TESTS_CV_DIFF_H_
diff --git a/tests/cv/diff_test.cc b/tests/cv/diff_test.cc
new file mode 100644
index 0000000..20d289e
--- /dev/null
+++ b/tests/cv/diff_test.cc
@@ -0,0 +1,97 @@
+// Copyright 2023 Google LLC
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#include "tests/cv/diff.h"
+
+#include <cstdint>
+
+#include "crt/kelvin.h"
+#include "tests/cv/test_helper.h"
+
+void diff_h_test() {
+  constexpr int kNumCol = 640;
+  uint16_t input0_row[kNumCol] __attribute__((aligned(64)));
+  uint16_t input1_row[kNumCol] __attribute__((aligned(64)));
+  uint16_t output_row[kNumCol] __attribute__((aligned(64)));
+  krand(kNumCol, input0_row);
+  krand(kNumCol, input1_row);
+
+  kelvin::cv::diff(kNumCol, input0_row, input1_row, output_row);
+
+  for (int i = 0; i < kNumCol; ++i) {
+    const uint16_t ref_value = input0_row[i] - input1_row[i];
+    if (ref_value != output_row[i]) {
+      printf("**error::diff_h_test[%d] %x %x\n", i, ref_value, output_row[i]);
+      exit(1);
+    }
+  }
+}
+
+void diff_hp_test() {
+  constexpr int kNumCol = 640;
+  uint16_t input0_row[kNumCol] __attribute__((aligned(64)));
+  uint16_t input1_row[kNumCol] __attribute__((aligned(64)));
+  uint16_t output_row[kNumCol] __attribute__((aligned(64)));
+  krand(kNumCol, input0_row);
+  krand(kNumCol, input1_row);
+
+  kelvin::cv::diffp(kNumCol, input0_row, input1_row, output_row);
+
+  for (int i = 0; i < kNumCol; ++i) {
+    const uint16_t ref_value = input0_row[i] - input1_row[i];
+    if (ref_value != output_row[i]) {
+      printf("**error::diff_hp_test[%d] %x %x\n", i, ref_value,
+             output_row[i]);
+      exit(1);
+    }
+  }
+}
+
+void diff_v_test() {
+  constexpr int kNumCol = 640;
+  uint16_t input0_row[kNumCol * 4] __attribute__((aligned(64)));
+  uint16_t input1_row[kNumCol * 4] __attribute__((aligned(64)));
+  uint16_t output_row[kNumCol * 4] __attribute__((aligned(64)));
+  krand(kNumCol * 4, input0_row);
+  krand(kNumCol * 4, input1_row);
+
+  kelvin::cv::diff4(kNumCol, kNumCol, input0_row, input1_row, output_row);
+
+  for (int i = 0; i < kNumCol * 4; ++i) {
+    const uint16_t ref_value = input0_row[i] - input1_row[i];
+    if (ref_value != output_row[i]) {
+      printf("**error::diff_v_test[%d,%d] %x %x\n", i / kNumCol, i % kNumCol,
+             ref_value, output_row[i]);
+      exit(1);
+    }
+  }
+}
+
+void diff_vp_test() {
+  constexpr int kNumCol = 640;
+  uint16_t input0_row[kNumCol * 4] __attribute__((aligned(64)));
+  uint16_t input1_row[kNumCol * 4] __attribute__((aligned(64)));
+  uint16_t output_row[kNumCol * 4] __attribute__((aligned(64)));
+  krand(kNumCol * 4, input0_row);
+  krand(kNumCol * 4, input1_row);
+
+  kelvin::cv::diff4p(kNumCol, kNumCol, input0_row, input1_row, output_row);
+
+  for (int i = 0; i < kNumCol * 4; ++i) {
+    const uint16_t ref_value = input0_row[i] - input1_row[i];
+    if (ref_value != output_row[i]) {
+      printf("**error::diff_vp_test[%d,%d] %x %x\n", i / kNumCol, i % kNumCol,
+             ref_value, output_row[i]);
+      exit(1);
+    }
+  }
+}
+
+int main() {
+  diff_h_test();
+  diff_hp_test();
+  diff_v_test();
+  diff_vp_test();
+  return 0;
+}
diff --git a/tests/cv/downsample.cc b/tests/cv/downsample.cc
new file mode 100644
index 0000000..eaf9bb6
--- /dev/null
+++ b/tests/cv/downsample.cc
@@ -0,0 +1,49 @@
+// Copyright 2023 Google LLC
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#include "tests/cv/downsample.h"
+
+#include <cstdint>
+
+#include "crt/kelvin.h"
+
+namespace kelvin::cv {
+
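+// 2x2 box downsample with rounding:
+//   out[i] = (in0[2*i] + in0[2*i+1] + in1[2*i] + in1[2*i+1] + 2) >> 2
+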
+void downsample(int num_output_cols, const uint16_t* input0_row,
+                const uint16_t* input1_row, uint16_t* output_row) {
+  int vl_input_0, vl_input_1, vl_output;
+  int m = 2 * num_output_cols;
+  int n = num_output_cols;
+  while (n > 0) {
+    getvl_h_x_m(vl_input_0, m);
+    m -= vl_input_0;
+    getvl_h_x_m(vl_input_1, m);
+    m -= vl_input_1;
+    getvl_h_x_m(vl_output, n);
+    n -= vl_output;
+
+    vld_h_lp_xx_m(vm12, input0_row, vl_input_0);
+    vld_h_lp_xx_m(vm13, input0_row, vl_input_1);
+    vld_h_lp_xx_m(vm14, input1_row, vl_input_0);
+    vld_h_lp_xx_m(vm15, input1_row, vl_input_1);
+
+    vpadd_w_u_v_m(vm8, vm12);
+    vpadd_w_u_v_m(vm9, vm13);
+    vpadd_w_u_v_m(vm10, vm14);
+    vpadd_w_u_v_m(vm11, vm15);
+    vadd_w_vv_m(vm6, vm8, vm10);
+    vadd_w_vv_m(vm7, vm9, vm11);
+
+    vevnodd_w_vv_m(vm4, vm6, vm7);
+
+    vsransu_h_r_vx_m(vm0, vm4, 2);
+
+    vst_h_lp_xx_m(vm0, output_row, vl_output);
+  }
+}
+
+};  // namespace kelvin::cv
diff --git a/tests/cv/downsample.h b/tests/cv/downsample.h
new file mode 100644
index 0000000..5aecb16
--- /dev/null
+++ b/tests/cv/downsample.h
@@ -0,0 +1,20 @@
+// Copyright 2023 Google LLC
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef TESTS_CV_DOWNSAMPLE_H_
+#define TESTS_CV_DOWNSAMPLE_H_
+
+#include <cstdint>
+
+namespace kelvin::cv {
+
+// DUT: downsample.cc
+// REF: downsample_test.cc
+
+void downsample(int num_output_cols, const uint16_t* input0_row,
+                const uint16_t* input1_row, uint16_t* output_row);
+
+};  // namespace kelvin::cv
+
+#endif  // TESTS_CV_DOWNSAMPLE_H_
diff --git a/tests/cv/downsample_test.cc b/tests/cv/downsample_test.cc
new file mode 100644
index 0000000..a1afa0b
--- /dev/null
+++ b/tests/cv/downsample_test.cc
@@ -0,0 +1,39 @@
+// Copyright 2023 Google LLC
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#include "tests/cv/downsample.h"
+
+#include <cstdint>
+
+#include "crt/kelvin.h"
+#include "tests/cv/test_helper.h"
+
+void downsample_test() {
+  constexpr int kNumInputCols = 640;
+  constexpr int kNumOutputCols = kNumInputCols / 2;
+  uint16_t input0_row[kNumInputCols] __attribute__((aligned(64)));
+  uint16_t input1_row[kNumInputCols] __attribute__((aligned(64)));
+  uint16_t output_row[kNumOutputCols] __attribute__((aligned(64)));
+  krand(kNumInputCols, input0_row);
+  krand(kNumInputCols, input1_row);
+
+  kelvin::cv::downsample(kNumOutputCols, input0_row, input1_row, output_row);
+
+  for (int i = 0; i < kNumOutputCols; ++i) {
+    const uint32_t s0 = input0_row[2 * i] + input0_row[2 * i + 1];
+    const uint32_t s1 = input1_row[2 * i] + input1_row[2 * i + 1];
+    const uint32_t s012 = s0 + s1 + 2;
+    const uint16_t ref_value = s012 >> 2;
+    if (ref_value != output_row[i]) {
+      printf("**error::downsample_test[%d] %x %x\n", i, ref_value,
+             output_row[i]);
+      exit(1);
+    }
+  }
+}
+
+int main() {
+  downsample_test();
+  return 0;
+}
diff --git a/tests/cv/extrema.cc b/tests/cv/extrema.cc
new file mode 100644
index 0000000..e4ade0d
--- /dev/null
+++ b/tests/cv/extrema.cc
@@ -0,0 +1,223 @@
+// Copyright 2023 Google LLC
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#include "tests/cv/extrema.h"
+
+#include <cstdint>
+
+#include "crt/kelvin.h"
+
+namespace kelvin::cv {
+
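+// Flags each column whose center value is a strict minimum (kMinimum) or
+// maximum (kMaximum) of its 3x3x3 neighborhood; output0 covers layer 1,
+// output1 covers layer 2 of the four input layers.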
+void extrema(int num_cols, const int16_t* input[4][3], uint8_t* output0,
+             uint8_t* output1) {
+#define prev00 v0
+#define prev01 v1
+#define prev02 v2
+#define prev10 v3
+#define prev11 v4
+#define prev12 v5
+#define prev20 v6
+#define prev21 v7
+#define prev22 v8
+#define prev30 v9
+#define prev31 v10
+#define prev32 v11
+#define curr00 v12
+#define curr01 v13
+#define curr02 v14
+#define curr10 v15
+#define curr11 v16
+#define curr12 v17
+#define curr20 v18
+#define curr21 v19
+#define curr22 v20
+#define curr30 v21
+#define curr31 v22
+#define curr32 v23
+#define next00 v24
+#define next01 v25
+#define next02 v26
+#define next10 v27
+#define next11 v28
+#define next12 v29
+#define next20 v30
+#define next21 v31
+#define next22 v32
+#define next30 v33
+#define next31 v34
+#define next32 v35
+#define elem v36
+#define tmin0 v37
+#define tmax0 v38
+#define tmin1 v39
+#define tmax1 v40
+#define rmin v41
+#define rmax v42
+#define value0 v43
+#define value1 v44
+#define result0 v45
+#define result1 v46
+
+  int16_t* ptr0 = const_cast<int16_t*>(input[0][0]);
+  int16_t* ptr1 = const_cast<int16_t*>(input[0][1]);
+  int16_t* ptr2 = const_cast<int16_t*>(input[0][2]);
+  int16_t* ptr3 = const_cast<int16_t*>(input[1][0]);
+  int16_t* ptr4 = const_cast<int16_t*>(input[1][1]);
+  int16_t* ptr5 = const_cast<int16_t*>(input[1][2]);
+  int16_t* ptr6 = const_cast<int16_t*>(input[2][0]);
+  int16_t* ptr7 = const_cast<int16_t*>(input[2][1]);
+  int16_t* ptr8 = const_cast<int16_t*>(input[2][2]);
+  int16_t* ptr9 = const_cast<int16_t*>(input[3][0]);
+  int16_t* ptra = const_cast<int16_t*>(input[3][1]);
+  int16_t* ptrb = const_cast<int16_t*>(input[3][2]);
+
+  uint8_t* out0 = output0;
+  uint8_t* out1 = output1;
+
+  vld_h_p_x(curr00, ptr0);
+  vld_h_p_x(curr01, ptr1);
+  vld_h_p_x(curr02, ptr2);
+  vld_h_p_x(curr10, ptr3);
+  vld_h_p_x(curr11, ptr4);
+  vld_h_p_x(curr12, ptr5);
+  vld_h_p_x(curr20, ptr6);
+  vld_h_p_x(curr21, ptr7);
+  vld_h_p_x(curr22, ptr8);
+  vld_h_p_x(curr30, ptr9);
+  vld_h_p_x(curr31, ptra);
+  vld_h_p_x(curr32, ptrb);
+
+  int vlenh;
+  getmaxvl_h(vlenh);
+
+  for (int i = 0; i < num_cols; i += vlenh) {
+    // Extrema compute.
+#define minmax_p(param0, param1, param2)                            \
+  vslidep_h_1_vv(elem, prev##param1##param2, curr##param1##param2); \
+  vmin_h_vv(tmin##param0, tmin##param0, elem);                      \
+  vmax_h_vv(tmax##param0, tmax##param0, elem);
+
+#define minmax_n(param0, param1, param2)                            \
+  vsliden_h_1_vv(elem, curr##param1##param2, next##param1##param2); \
+  vmin_h_vv(tmin##param0, tmin##param0, elem);                      \
+  vmax_h_vv(tmax##param0, tmax##param0, elem);
+
+#define minmax_c(param0, param1, param2)                       \
+  vmin_h_vv(tmin##param0, tmin##param0, prev##param1##param2); \
+  vmax_h_vv(tmax##param0, tmax##param0, prev##param1##param2);
+
+    // Common centers.
+    vmin_h_vv(tmin0, curr10, curr12);
+    vmax_h_vv(tmax0, curr10, curr12);
+    vmin_h_vv(tmin0, tmin0, curr20);
+    vmax_h_vv(tmax0, tmax0, curr20);
+    vmin_h_vv(tmin0, tmin0, curr22);
+    vmax_h_vv(tmax0, tmax0, curr22);
+
+    // Common inner two layers.
+    vld_h_p_x(next10, ptr3);
+    vld_h_p_x(next11, ptr4);
+    vld_h_p_x(next12, ptr5);
+    minmax_p(0, 1, 0);
+    minmax_n(0, 1, 0);
+    minmax_p(0, 1, 1);
+    minmax_n(0, 1, 1);
+    minmax_p(0, 1, 2);
+    minmax_n(0, 1, 2);
+    vmv_v(prev10, curr10);
+    vmv_v(prev11, curr11);
+    vmv_v(prev12, curr12);
+    vmv_v(curr10, next10);
+    vmv_v(curr11, next11);
+    vmv_v(curr12, next12);
+
+    vld_h_p_x(next20, ptr6);
+    vld_h_p_x(next21, ptr7);
+    vld_h_p_x(next22, ptr8);
+    minmax_p(0, 2, 0);
+    minmax_n(0, 2, 0);
+    minmax_p(0, 2, 1);
+    minmax_n(0, 2, 1);
+    minmax_p(0, 2, 2);
+    minmax_n(0, 2, 2);
+    vmv_v(prev20, curr20);
+    vmv_v(prev21, curr21);
+    vmv_v(prev22, curr22);
+    vmv_v(curr20, next20);
+    vmv_v(curr21, next21);
+    vmv_v(curr22, next22);
+
+    // Shared state end.
+    vmv_v(tmax1, tmax0);
+    vmv_v(tmin1, tmin0);
+
+    // [0,1,2]
+    vld_h_p_x(next00, ptr0);
+    vld_h_p_x(next01, ptr1);
+    vld_h_p_x(next02, ptr2);
+    minmax_p(0, 0, 0);
+    minmax_n(0, 0, 0);
+    minmax_p(0, 0, 1);
+    minmax_n(0, 0, 1);
+    minmax_p(0, 0, 2);
+    minmax_n(0, 0, 2);
+    vmv_v(prev00, curr00);
+    vmv_v(prev01, curr01);
+    vmv_v(prev02, curr02);
+    vmv_v(curr00, next00);
+    vmv_v(curr01, next01);
+    vmv_v(curr02, next02);
+
+    minmax_c(0, 0, 0);
+    minmax_c(0, 0, 1);
+    minmax_c(0, 0, 2);
+    minmax_c(0, 2, 1);
+
+    // [1,2,3]
+    vld_h_p_x(next30, ptr9);
+    vld_h_p_x(next31, ptra);
+    vld_h_p_x(next32, ptrb);
+    minmax_p(1, 3, 0);
+    minmax_n(1, 3, 0);
+    minmax_p(1, 3, 1);
+    minmax_n(1, 3, 1);
+    minmax_p(1, 3, 2);
+    minmax_n(1, 3, 2);
+    vmv_v(prev30, curr30);
+    vmv_v(prev31, curr31);
+    vmv_v(prev32, curr32);
+    vmv_v(curr30, next30);
+    vmv_v(curr31, next31);
+    vmv_v(curr32, next32);
+
+    minmax_c(1, 1, 1);
+    minmax_c(1, 3, 0);
+    minmax_c(1, 3, 1);
+    minmax_c(1, 3, 2);
+
+    // Compare center with min:max.
+    vmv_v(value0, prev11);
+    vmv_v(value1, prev21);
+
+    vlt_h_vv(rmin, value0, tmin0);
+    vgt_h_vv(rmax, value0, tmax0);
+    vsll_h_vx(rmax, rmax, 1);
+    vor_vv(result0, rmax, rmin);
+    vevn_b_vv(result0, result0, result0);
+    vst_b_lp_xx(result0, out0, vlenh);
+
+    vlt_h_vv(rmin, value1, tmin1);
+    vgt_h_vv(rmax, value1, tmax1);
+    vsll_h_vx(rmax, rmax, 1);
+    vor_vv(result1, rmax, rmin);
+    vevn_b_vv(result1, result1, result1);
+    vst_b_lp_xx(result1, out1, vlenh);
+  }
+}
+
+};  // namespace kelvin::cv
diff --git a/tests/cv/extrema.h b/tests/cv/extrema.h
new file mode 100644
index 0000000..5872217
--- /dev/null
+++ b/tests/cv/extrema.h
@@ -0,0 +1,20 @@
+// Copyright 2023 Google LLC
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef TESTS_CV_EXTREMA_H_
+#define TESTS_CV_EXTREMA_H_
+
+#include <cstdint>
+
+namespace kelvin::cv {
+
+// DUT: extrema.cc
+// REF: extrema_test.cc
+
+void extrema(int num_cols, const int16_t *input[4][3], uint8_t *output0,
+             uint8_t *output1);
+
+};  // namespace kelvin::cv
+
+#endif  // TESTS_CV_EXTREMA_H_
diff --git a/tests/cv/extrema_test.cc b/tests/cv/extrema_test.cc
new file mode 100644
index 0000000..81f0163
--- /dev/null
+++ b/tests/cv/extrema_test.cc
@@ -0,0 +1,139 @@
+// Copyright 2023 Google LLC
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#include "tests/cv/extrema.h"
+
+#include <cstdint>
+
+#include "crt/kelvin.h"
+#include "tests/cv/test_helper.h"
+
+enum kMode { kNone, kMinimum, kMaximum };
+
+template <typename TComparisonOp>
+bool IsPointExtrema(const int16_t *input[4][3], int layer_id, int col,
+                    TComparisonOp comparison_op) {
+  const int16_t center_value = input[layer_id][1][col];
+  for (int layer_id_offset = -1; layer_id_offset <= 1; layer_id_offset++) {
+    for (int row_id = 0; row_id < 3; row_id++) {
+      for (int col_offset = -1; col_offset <= 1; col_offset++) {
+        // Do not compare to input[layer_id][1][col] which is value.
+        if (layer_id_offset == 0 && row_id == 1 && col_offset == 0) {
+          continue;
+        }
+        if (!comparison_op(
+                center_value,
+                input[layer_id + layer_id_offset][row_id][col + col_offset])) {
+          return false;
+        }
+      }
+    }
+  }
+  return true;
+}
+
+void Extrema(int num_cols, const int16_t *input[4][3], uint8_t *output0,
+             uint8_t *output1) {
+  auto max_comp = [](int x, int y) { return x > y; };
+  auto min_comp = [](int x, int y) { return x < y; };
+
+  // Update extrema for layer 1.
+  for (int col = 1; col < num_cols - 1; col++) {
+    output0[col] = kMode::kNone;
+    if (IsPointExtrema(input, 1, col, max_comp)) {
+      output0[col] = kMaximum;
+      continue;
+    }
+    if (IsPointExtrema(input, 1, col, min_comp)) {
+      output0[col] = kMode::kMinimum;
+    }
+  }
+
+  // Update extrema for layer 2.
+  for (int col = 1; col < num_cols - 1; col++) {
+    output1[col] = kMode::kNone;
+    if (IsPointExtrema(input, 2, col, max_comp)) {
+      output1[col] = kMode::kMaximum;
+      continue;
+    }
+    if (IsPointExtrema(input, 2, col, min_comp)) {
+      output1[col] = kMode::kMinimum;
+    }
+  }
+}
+
+void extrema_test() {
+  constexpr int kNumCols = 640;
+
+  int16_t input0_row0[kNumCols] __attribute__((aligned(64)));
+  int16_t input0_row1[kNumCols] __attribute__((aligned(64)));
+  int16_t input0_row2[kNumCols] __attribute__((aligned(64)));
+  int16_t input1_row0[kNumCols] __attribute__((aligned(64)));
+  int16_t input1_row1[kNumCols] __attribute__((aligned(64)));
+  int16_t input1_row2[kNumCols] __attribute__((aligned(64)));
+  int16_t input2_row0[kNumCols] __attribute__((aligned(64)));
+  int16_t input2_row1[kNumCols] __attribute__((aligned(64)));
+  int16_t input2_row2[kNumCols] __attribute__((aligned(64)));
+  int16_t input3_row0[kNumCols] __attribute__((aligned(64)));
+  int16_t input3_row1[kNumCols] __attribute__((aligned(64)));
+  int16_t input3_row2[kNumCols] __attribute__((aligned(64)));
+
+  uint8_t output0_ref[kNumCols];
+  uint8_t output1_ref[kNumCols];
+  uint8_t output0_dut[kNumCols];
+  uint8_t output1_dut[kNumCols];
+
+  const int16_t *input[4][3] = {{reinterpret_cast<int16_t *>(input0_row0),
+                                 reinterpret_cast<int16_t *>(input0_row1),
+                                 reinterpret_cast<int16_t *>(input0_row2)},
+                                {reinterpret_cast<int16_t *>(input1_row0),
+                                 reinterpret_cast<int16_t *>(input1_row1),
+                                 reinterpret_cast<int16_t *>(input1_row2)},
+                                {reinterpret_cast<int16_t *>(input2_row0),
+                                 reinterpret_cast<int16_t *>(input2_row1),
+                                 reinterpret_cast<int16_t *>(input2_row2)},
+                                {reinterpret_cast<int16_t *>(input3_row0),
+                                 reinterpret_cast<int16_t *>(input3_row1),
+                                 reinterpret_cast<int16_t *>(input3_row2)}};
+
+  krand(kNumCols, input0_row0);
+  krand(kNumCols, input0_row1);
+  krand(kNumCols, input0_row2);
+  krand(kNumCols, input1_row0);
+  krand(kNumCols, input1_row1);
+  krand(kNumCols, input1_row2);
+  krand(kNumCols, input2_row0);
+  krand(kNumCols, input2_row1);
+  krand(kNumCols, input2_row2);
+  krand(kNumCols, input3_row0);
+  krand(kNumCols, input3_row1);
+  krand(kNumCols, input3_row2);
+
+  Extrema(kNumCols, input, output0_ref, output1_ref);
+
+  kelvin::cv::extrema(kNumCols, input, output0_dut, output1_dut);
+
+  for (int i = 1; i < kNumCols - 1; ++i) {
+    const uint8_t ref = output0_ref[i];
+    const uint8_t dut = output0_dut[i];
+    if (ref != dut) {
+      printf("**error::extrema0[%d] %x %x\n", i, ref, dut);
+      exit(1);
+    }
+  }
+
+  for (int i = 1; i < kNumCols - 1; ++i) {
+    const uint8_t ref = output1_ref[i];
+    const uint8_t dut = output1_dut[i];
+    if (ref != dut) {
+      printf("**error::extrema1[%d] %x %x\n", i, ref, dut);
+      exit(1);
+    }
+  }
+}
+
+int main() {
+  extrema_test();
+  return 0;
+}
diff --git a/tests/cv/gaussian.cc b/tests/cv/gaussian.cc
new file mode 100644
index 0000000..520cfe8
--- /dev/null
+++ b/tests/cv/gaussian.cc
@@ -0,0 +1,224 @@
+// Copyright 2023 Google LLC
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#include "tests/cv/gaussian.h"
+
+#include "crt/kelvin.h"
+
+// Note: the separable kernel runs vertically, then horizontally. Running H
+// then V and retaining the intermediate result may reduce compute further.
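+//
+// Both passes use the 5-tap binomial kernel [1 4 6 4 1]; the combined 2D
+// gain of 256 is removed by the final rounding shift of 8 (vsransu).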
+
+namespace kelvin::cv {
+
+static void GaussianVerticalKernel(int num_cols, const uint16_t* input0,
+                                   const uint16_t* input1,
+                                   const uint16_t* input2,
+                                   const uint16_t* input3,
+                                   const uint16_t* input4, bool is_stripmine,
+                                   uint32_t* output0, uint32_t* output1) {
+  uint32_t vl_input, vl_output;
+  while (num_cols > 0) {
+    if (is_stripmine) {
+      getvl_h_x_m(vl_input, num_cols);
+      num_cols -= vl_input;
+      vl_output = vl_input / 2;
+      vld_h_lp_xx_m(vm8, input0, vl_input);
+      vld_h_lp_xx_m(vm12, input4, vl_input);
+      vld_h_lp_xx_m(vm9, input1, vl_input);
+      vld_h_lp_xx_m(vm10, input2, vl_input);
+      vld_h_lp_xx_m(vm11, input3, vl_input);
+
+      vaddw_w_u_vv_m(vm0, vm8, vm12);
+      vmulw_w_u_vx_m(vm2, vm9, 4);
+      vmulw_w_u_vx_m(vm4, vm10, 6);
+      vmulw_w_u_vx_m(vm6, vm11, 4);
+
+      vadd3_w_vv_m(vm0, vm2, vm4);
+      vadd3_w_vv_m(vm1, vm3, vm5);
+      vadd_w_vv_m(vm0, vm0, vm6);
+      vadd_w_vv_m(vm1, vm1, vm7);
+
+      vst_w_lp_xx_m(vm0, output0, vl_output);
+      vst_w_lp_xx_m(vm1, output1, vl_output);
+    } else {
+      getvl_h_x(vl_input, num_cols);
+      num_cols -= vl_input;
+      vl_output = vl_input / 2;
+      vld_h_lp_xx(v10, input0, vl_input);
+      vld_h_lp_xx(v11, input1, vl_input);
+      vld_h_lp_xx(v12, input2, vl_input);
+      vld_h_lp_xx(v13, input3, vl_input);
+      vld_h_lp_xx(v14, input4, vl_input);
+
+      vaddw_w_u_vv(v16, v10, v14);
+      vmulw_w_u_vx(v18, v11, 4);
+      vmulw_w_u_vx(v20, v12, 6);
+      vmulw_w_u_vx(v22, v13, 4);
+
+      vadd3_w_vv(v16, v18, v20);
+      vadd3_w_vv(v17, v19, v21);
+      vadd_w_vv(v16, v16, v22);
+      vadd_w_vv(v17, v17, v23);
+
+      vst_w_lp_xx(v16, output0, vl_output);
+      vst_w_lp_xx(v17, output1, vl_output);
+    }
+  }
+}
+
+static void GaussianHorizontalKernel(int num_cols, const uint32_t* input0,
+                                     const uint32_t* input1, bool is_stripmine,
+                                     uint16_t* output) {
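+  // input0/input1 hold the vertical sums for even/odd output columns. The
+  // horizontal neighbors of an even column live in the odd stream (and
+  // vice versa), hence the crossed even/odd multiply-accumulate terms.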
+#define PREV0 vm8
+#define PREV1 vm9
+#define CURR0 vm10
+#define CURR1 vm11
+#define NEXT0 vm12
+#define NEXT1 vm13
+#define P0 vm4
+#define P1 vm5
+#define N0 vm6
+#define N1 vm7
+#define SN vm14
+
+#define Rm0 vm0
+#define Rm1 vm1
+#define R0 v4
+#define R1 v5
+#define T0 vm2
+#define T1 vm3
+
+  uint32_t vl_input, vl_output;
+
+  if (is_stripmine) {
+    getmaxvl_w_m(vl_input);
+
+    vld_w_x_m(PREV0, input0 - vl_input);
+    vld_w_x_m(PREV1, input1 - vl_input);
+    vld_w_p_x_m(CURR0, input0);
+    vld_w_p_x_m(CURR1, input1);
+  } else {
+    getmaxvl_w(vl_input);
+
+    vld_w_x(PREV0, input0 - vl_input);
+    vld_w_x(PREV1, input1 - vl_input);
+    vld_w_p_x(CURR0, input0);
+    vld_w_p_x(CURR1, input1);
+  }
+
+  while (num_cols > 0) {
+    if (is_stripmine) {
+      getvl_h_x_m(vl_output, num_cols);
+      num_cols -= vl_output;
+
+      vld_w_p_x_m(NEXT0, input0);
+      vld_w_p_x_m(NEXT1, input1);
+
+      vslidehp_w_1_vv_m(P0, PREV0, CURR0);
+      vslidehp_w_1_vv_m(P1, PREV1, CURR1);
+      vslidehn_w_1_vv_m(N0, CURR0, NEXT0);
+      vslidehn_w_1_vv_m(N1, CURR1, NEXT1);
+
+      // even / odd, with additional accumulator
+      vmul_w_vx_m(Rm0, P1, 4);
+      vmul_w_vx_m(Rm1, CURR0, 4);
+      vadd_w_vv_m(T0, P0, N0);
+      vadd_w_vv_m(T1, P1, N1);
+      vmacc_w_vx_m(Rm0, CURR0, 6);
+      vmacc_w_vx_m(Rm1, CURR1, 6);
+      vmacc_w_vx_m(T0, CURR1, 4);
+      vmacc_w_vx_m(T1, N0, 4);
+      vadd_w_vv_m(Rm0, Rm0, T0);
+      vadd_w_vv_m(Rm1, Rm1, T1);
+
+      vsransu_h_r_vx_m(SN, Rm0, 8);
+
+      vst_h_lp_xx_m(SN, output, vl_output);
+
+      vmv_v_m(PREV0, CURR0);
+      vmv_v_m(PREV1, CURR1);
+      vmv_v_m(CURR0, NEXT0);
+      vmv_v_m(CURR1, NEXT1);
+    } else {
+      getvl_h_x(vl_output, num_cols);
+      num_cols -= vl_output;
+
+      vld_w_p_x(NEXT0, input0);
+      vld_w_p_x(NEXT1, input1);
+
+      vslidep_w_1_vv(P0, PREV0, CURR0);
+      vslidep_w_1_vv(P1, PREV1, CURR1);
+      vsliden_w_1_vv(N0, CURR0, NEXT0);
+      vsliden_w_1_vv(N1, CURR1, NEXT1);
+
+      // even
+      vadd_w_vv(R0, P0, N0);
+      vmacc_w_vx(R0, P1, 4);
+      vmacc_w_vx(R0, CURR0, 6);
+      vmacc_w_vx(R0, CURR1, 4);
+
+      // odd
+      vadd_w_vv(R1, P1, N1);
+      vmacc_w_vx(R1, CURR0, 4);
+      vmacc_w_vx(R1, CURR1, 6);
+      vmacc_w_vx(R1, N0, 4);
+
+      vsransu_h_r_vx(SN, R0, 8);
+
+      vst_h_lp_xx(SN, output, vl_output);
+
+      vmv_v(PREV0, CURR0);
+      vmv_v(PREV1, CURR1);
+      vmv_v(CURR0, NEXT0);
+      vmv_v(CURR1, NEXT1);
+    }
+  }
+}
+
+#define ARRAYSIZE(x) (sizeof(x) / sizeof((x)[0]))
+
+void gaussian(int num_cols, const uint16_t* input0_row,
+              const uint16_t* input1_row, const uint16_t* input2_row,
+              const uint16_t* input3_row, const uint16_t* input4_row,
+              bool is_stripmine, uint16_t* output_row) {
+  int vlenw;
+  getmaxvl_w(vlenw);
+  const int interleave_num = num_cols / 2 - 1;  // even/odd interleaved
+  uint32_t temp0_data_unpadded[1024 + 2 * vlenw] __attribute__((aligned(64)));
+  uint32_t temp1_data_unpadded[1024 + 2 * vlenw] __attribute__((aligned(64)));
+  uint32_t* temp0_data = temp0_data_unpadded + vlenw;
+  uint32_t* temp1_data = temp1_data_unpadded + vlenw;
+
+  GaussianVerticalKernel(num_cols, input0_row, input1_row, input2_row,
+                         input3_row, input4_row, is_stripmine, temp0_data,
+                         temp1_data);
+  if (temp0_data <= temp0_data_unpadded ||
+      (temp0_data - temp0_data_unpadded) + interleave_num + 1 >=
+          ARRAYSIZE(temp0_data_unpadded)) {
+    printf("**error**: temp0_data out of bound\n");
+    exit(1);
+  }
+  if (temp1_data <= temp1_data_unpadded ||
+      (temp1_data - temp1_data_unpadded) + interleave_num + 1 >=
+          ARRAYSIZE(temp1_data_unpadded)) {
+    printf("**error**: temp1_data out of bound\n");
+    exit(1);
+  }
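+  // Replicate edges across the interleaved even/odd streams: o[-1] maps to
+  // x[-1], which clamps to x[0] = e[0], hence temp1_data[-1] = temp0_data[0].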
+  temp0_data[-1] = temp0_data[0];
+  temp1_data[-1] = temp0_data[0];
+  temp0_data[interleave_num + 1] = temp1_data[interleave_num];
+  temp1_data[interleave_num + 1] = temp1_data[interleave_num];
+  GaussianHorizontalKernel(num_cols, temp0_data, temp1_data, is_stripmine,
+                           output_row);
+}
+
+};  // namespace kelvin::cv
diff --git a/tests/cv/gaussian.h b/tests/cv/gaussian.h
new file mode 100644
index 0000000..c06f21a
--- /dev/null
+++ b/tests/cv/gaussian.h
@@ -0,0 +1,22 @@
+// Copyright 2023 Google LLC
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef TESTS_CV_GAUSSIAN_H_
+#define TESTS_CV_GAUSSIAN_H_
+
+#include <cstdint>
+
+namespace kelvin::cv {
+
+// DUT: gaussian.cc
+// REF: gaussian_test.cc
+
+void gaussian(int num_cols, const uint16_t* input0_row,
+              const uint16_t* input1_row, const uint16_t* input2_row,
+              const uint16_t* input3_row, const uint16_t* input4_row,
+              bool is_stripmine, uint16_t* output_row);
+
+};  // namespace kelvin::cv
+
+#endif  // TESTS_CV_GAUSSIAN_H_
diff --git a/tests/cv/gaussian_test.cc b/tests/cv/gaussian_test.cc
new file mode 100644
index 0000000..e310b3e
--- /dev/null
+++ b/tests/cv/gaussian_test.cc
@@ -0,0 +1,63 @@
+// Copyright 2023 Google LLC
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#include "tests/cv/gaussian.h"
+
+#include <algorithm>
+#include <cstdint>
+
+#include "crt/kelvin.h"
+#include "tests/cv/test_helper.h"
+
+void gaussian_test() {
+  constexpr int num_cols = 640;
+  uint16_t input0_row[num_cols] __attribute__((aligned(64)));
+  uint16_t input1_row[num_cols] __attribute__((aligned(64)));
+  uint16_t input2_row[num_cols] __attribute__((aligned(64)));
+  uint16_t input3_row[num_cols] __attribute__((aligned(64)));
+  uint16_t input4_row[num_cols] __attribute__((aligned(64)));
+  uint16_t output_stripmined_row[num_cols] __attribute__((aligned(64))) = {0};
+  uint16_t output_row[num_cols] __attribute__((aligned(64))) = {0};
+  krand(num_cols, input0_row);
+  krand(num_cols, input1_row);
+  krand(num_cols, input2_row);
+  krand(num_cols, input3_row);
+  krand(num_cols, input4_row);
+
+  kelvin::cv::gaussian(num_cols, input0_row, input1_row, input2_row, input3_row,
+                       input4_row, true /*is_stripmine*/,
+                       output_stripmined_row);
+  kelvin::cv::gaussian(num_cols, input0_row, input1_row, input2_row, input3_row,
+                       input4_row, false /*is_stripmine*/, output_row);
+
+  for (int i = 0; i < num_cols; ++i) {
+    uint32_t h[5];
+    for (int j = 0; j < 5; j++) {
+      int idx = std::min(num_cols - 1, std::max(0, i + j - 2));
+      uint32_t v = 0;
+      v += input0_row[idx];
+      v += input1_row[idx] * 4;
+      v += input2_row[idx] * 6;
+      v += input3_row[idx] * 4;
+      v += input4_row[idx];
+      h[j] = v;
+    }
+    const uint32_t k = h[0] + h[1] * 4 + h[2] * 6 + h[3] * 4 + h[4] + 128;
+    const uint16_t ref_value = k >> 8;
+    if (ref_value != output_stripmined_row[i]) {
+      printf("**error::stripmined gaussian[%d] %x %x\n", i, ref_value,
+             output_stripmined_row[i]);
+      exit(1);
+    }
+    if (ref_value != output_row[i]) {
+      printf("**error::gaussian[%d] %x %x\n", i, ref_value, output_row[i]);
+      exit(1);
+    }
+  }
+}
+
+int main() {
+  gaussian_test();
+  return 0;
+}
diff --git a/tests/cv/shift_gaussian.cc b/tests/cv/shift_gaussian.cc
new file mode 100644
index 0000000..be5aa93
--- /dev/null
+++ b/tests/cv/shift_gaussian.cc
@@ -0,0 +1,174 @@
+// Copyright 2023 Google LLC
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#include "tests/cv/shift_gaussian.h"
+
+#include <cstdint>
+
+#include "crt/kelvin.h"
+
+// Note: the separable kernel runs vertically, then horizontally. Running H
+// then V and retaining the intermediate result may reduce compute further.
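+//
+// Unlike gaussian.cc, the result keeps the full 16-bit kernel sum (gain
+// 256); there is no final normalizing shift.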
+
+namespace kelvin::cv {
+
+static void GaussianVerticalKernel(int num_cols, const uint8_t* input0,
+                                   const uint8_t* input1, const uint8_t* input2,
+                                   const uint8_t* input3, const uint8_t* input4,
+                                   bool is_stripmine, uint16_t* output) {
+  uint32_t vl_input, vl_output_0, vl_output_1;
+  while (num_cols > 0) {
+    if (is_stripmine) {
+      getvl_b_x_m(vl_input, num_cols);
+      getvl_h_x_m(vl_output_0, num_cols);
+      num_cols -= vl_input;
+      vl_output_1 = vl_input - vl_output_0;
+
+      vld_b_lp_xx_m(v32, input0, vl_input);
+      vld_b_lp_xx_m(v36, input1, vl_input);
+      vld_b_lp_xx_m(v40, input2, vl_input);
+      vld_b_lp_xx_m(v44, input3, vl_input);
+      vld_b_lp_xx_m(v48, input4, vl_input);
+
+      vaddw_h_u_vv_m(v0, v32, v48);
+      vmulw_h_u_vx_m(v8, v36, 4);
+      vmulw_h_u_vx_m(v16, v40, 6);
+      vmulw_h_u_vx_m(v24, v44, 4);
+
+      vadd3_h_vv_m(v0, v8, v16);
+      vadd3_h_vv_m(v4, v12, v20);
+      vadd_h_vv_m(v0, v0, v24);
+      vadd_h_vv_m(v4, v4, v28);
+
+      vzip_h_vv_m(v16, v0, v4);
+
+      vst_h_lp_xx_m(v16, output, vl_output_0);
+      vst_h_lp_xx_m(v20, output, vl_output_1);
+    } else {
+      getvl_b_x(vl_input, num_cols);
+      getvl_h_x(vl_output_0, num_cols);
+      num_cols -= vl_input;
+      vl_output_1 = vl_input - vl_output_0;
+
+      vld_b_lp_xx(v10, input0, vl_input);
+      vld_b_lp_xx(v11, input1, vl_input);
+      vld_b_lp_xx(v12, input2, vl_input);
+      vld_b_lp_xx(v13, input3, vl_input);
+      vld_b_lp_xx(v14, input4, vl_input);
+
+      vaddw_h_u_vv(v16, v10, v14);
+      vmulw_h_u_vx(v18, v11, 4);
+      vmulw_h_u_vx(v20, v12, 6);
+      vmulw_h_u_vx(v22, v13, 4);
+
+      vadd3_h_vv(v16, v18, v20);
+      vadd3_h_vv(v17, v19, v21);
+      vadd_h_vv(v16, v16, v22);
+      vadd_h_vv(v17, v17, v23);
+
+      vzip_h_vv(v0, v16, v17);
+
+      vst_h_lp_xx(v0, output, vl_output_0);
+      vst_h_lp_xx(v1, output, vl_output_1);
+    }
+  }
+}
+
+static void GaussianHorizontalKernel(int num_cols, const uint16_t* input,
+                                     bool is_stripmine, uint16_t* output) {
+#define PREV v32
+#define CURR v40
+#define NEXT v48
+#define P2 v16
+#define P1 v20
+#define N1 v24
+#define N2 v28
+#define RS v0
+
+  uint32_t vl_input, vl_output;
+
+  if (is_stripmine) {
+    getmaxvl_h_m(vl_input);
+
+    vld_h_x_m(PREV, input - vl_input);
+    vld_h_p_x_m(CURR, input);
+  } else {
+    getmaxvl_h(vl_input);
+
+    vld_h_x(PREV, input - vl_input);
+    vld_h_p_x(CURR, input);
+  }
+
+  while (num_cols > 0) {
+    if (is_stripmine) {
+      getvl_h_x_m(vl_output, num_cols);
+      num_cols -= vl_output;
+
+      vld_h_p_x_m(NEXT, input);
+
+      vslidehp_h_2_vv_m(P2, PREV, CURR);
+      vslidehp_h_1_vv_m(P1, PREV, CURR);
+      vslidehn_h_1_vv_m(N1, CURR, NEXT);
+      vslidehn_h_2_vv_m(N2, CURR, NEXT);
+
+      vadd_h_vv_m(RS, P2, N2);
+      vmacc_h_vx_m(RS, P1, 4);
+      vmacc_h_vx_m(RS, CURR, 6);
+      vmacc_h_vx_m(RS, N1, 4);
+
+      vst_h_lp_xx_m(RS, output, vl_output);
+
+      vmv_v_m(PREV, CURR);
+      vmv_v_m(CURR, NEXT);
+    } else {
+      getvl_h_x(vl_output, num_cols);
+      num_cols -= vl_output;
+
+      vld_h_p_x(NEXT, input);
+
+      vslidep_h_2_vv(P2, PREV, CURR);
+      vslidep_h_1_vv(P1, PREV, CURR);
+      vsliden_h_1_vv(N1, CURR, NEXT);
+      vsliden_h_2_vv(N2, CURR, NEXT);
+
+      vadd_h_vv(RS, P2, N2);
+      vmacc_h_vx(RS, P1, 4);
+      vmacc_h_vx(RS, CURR, 6);
+      vmacc_h_vx(RS, N1, 4);
+
+      vst_h_lp_xx(RS, output, vl_output);
+
+      vmv_v(PREV, CURR);
+      vmv_v(CURR, NEXT);
+    }
+  }
+}
+
+void shift_gaussian(int num_cols, const uint8_t* input0_row,
+                    const uint8_t* input1_row, const uint8_t* input2_row,
+                    const uint8_t* input3_row, const uint8_t* input4_row,
+                    bool is_stripmine, uint16_t* output_row) {
+  int vlenh;
+  getmaxvl_h(vlenh);
+  const int r = num_cols - 1;
+  uint16_t temp_data_unpadded[1024 + 2 * vlenh] __attribute__((aligned(64)));
+  uint16_t* temp_data = temp_data_unpadded + vlenh;
+
+  GaussianVerticalKernel(num_cols, input0_row, input1_row, input2_row,
+                         input3_row, input4_row, is_stripmine, temp_data);
+  if (temp_data <= &temp_data_unpadded[1]) {
+    printf("**error**: temp_data out of bound\n");
+    exit(1);
+  }
+  temp_data[-1] = temp_data[0];
+  temp_data[-2] = temp_data[0];
+  temp_data[r + 1] = temp_data[r];
+  temp_data[r + 2] = temp_data[r];
+  GaussianHorizontalKernel(num_cols, temp_data, is_stripmine, output_row);
+}
+
+};  // namespace kelvin::cv
diff --git a/tests/cv/shift_gaussian.h b/tests/cv/shift_gaussian.h
new file mode 100644
index 0000000..4d494d6
--- /dev/null
+++ b/tests/cv/shift_gaussian.h
@@ -0,0 +1,22 @@
+// Copyright 2023 Google LLC
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef TESTS_CV_SHIFT_GAUSSIAN_H_
+#define TESTS_CV_SHIFT_GAUSSIAN_H_
+
+#include <cstdint>
+
+namespace kelvin::cv {
+
+// DUT: shift_gaussian.cc
+// REF: shift_gaussian_test.cc
+
+void shift_gaussian(int num_cols, const uint8_t* input0_row,
+                    const uint8_t* input1_row, const uint8_t* input2_row,
+                    const uint8_t* input3_row, const uint8_t* input4_row,
+                    bool is_stripmine, uint16_t* output_row);
+
+};  // namespace kelvin::cv
+
+#endif  // TESTS_CV_SHIFT_GAUSSIAN_H_
diff --git a/tests/cv/shift_gaussian_test.cc b/tests/cv/shift_gaussian_test.cc
new file mode 100644
index 0000000..813e50b
--- /dev/null
+++ b/tests/cv/shift_gaussian_test.cc
@@ -0,0 +1,65 @@
+// Copyright 2023 Google LLC
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#include "tests/cv/shift_gaussian.h"
+
+#include <algorithm>
+#include <cstdint>
+
+#include "crt/kelvin.h"
+#include "tests/cv/test_helper.h"
+
+void shift_gaussian_test() {
+  constexpr int kNumCols = 640;
+  uint8_t input0_row[kNumCols] __attribute__((aligned(64)));
+  uint8_t input1_row[kNumCols] __attribute__((aligned(64)));
+  uint8_t input2_row[kNumCols] __attribute__((aligned(64)));
+  uint8_t input3_row[kNumCols] __attribute__((aligned(64)));
+  uint8_t input4_row[kNumCols] __attribute__((aligned(64)));
+  uint16_t output_stripmined_row[kNumCols] __attribute__((aligned(64))) = {0};
+  uint16_t output_row[kNumCols] __attribute__((aligned(64))) = {0};
+  krand(kNumCols, input0_row);
+  krand(kNumCols, input1_row);
+  krand(kNumCols, input2_row);
+  krand(kNumCols, input3_row);
+  krand(kNumCols, input4_row);
+
+  kelvin::cv::shift_gaussian(kNumCols, input0_row, input1_row, input2_row,
+                             input3_row, input4_row, true /*is_stripmine*/,
+                             output_stripmined_row);
+
+  kelvin::cv::shift_gaussian(kNumCols, input0_row, input1_row, input2_row,
+                             input3_row, input4_row, false /*is_stripmine*/,
+                             output_row);
+
+  for (int i = 0; i < kNumCols; ++i) {
+    uint16_t h[5];
+    for (int j = 0; j < 5; j++) {
+      int idx = std::min(kNumCols - 1, std::max(0, i + j - 2));
+      uint16_t v = 0;
+      v += input0_row[idx];
+      v += input1_row[idx] * 4;
+      v += input2_row[idx] * 6;
+      v += input3_row[idx] * 4;
+      v += input4_row[idx];
+      h[j] = v;
+    }
+    const uint16_t ref_value = h[0] + h[1] * 4 + h[2] * 6 + h[3] * 4 + h[4];
+    if (ref_value != output_stripmined_row[i]) {
+      printf("**error::stripmine shift_gaussian[%d] %x %x\n", i, ref_value,
+             output_stripmined_row[i]);
+      exit(1);
+    }
+    if (ref_value != output_row[i]) {
+      printf("**error::shift_gaussian[%d] %x %x\n", i, ref_value,
+             output_row[i]);
+      exit(1);
+    }
+  }
+}
+
+int main() {
+  shift_gaussian_test();
+  return 0;
+}
diff --git a/tests/cv/test_helper.h b/tests/cv/test_helper.h
new file mode 100644
index 0000000..b0e7f8f
--- /dev/null
+++ b/tests/cv/test_helper.h
@@ -0,0 +1,29 @@
+// Copyright 2023 Google LLC
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef TESTS_CV_TEST_HELPER_H_
+#define TESTS_CV_TEST_HELPER_H_
+
+#include <cstdint>
+
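+// Marsaglia's xorshift128 PRNG; deterministic, so test failures reproduce
+// across runs without depending on libc rand().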
+static uint32_t krand(void) {
+  static uint32_t x = 123456789;
+  static uint32_t y = 362436069;
+  static uint32_t z = 521288629;
+  static uint32_t w = 88675123;
+  uint32_t t = x ^ (x << 11);
+  x = y;
+  y = z;
+  z = w;
+  return w = w ^ (w >> 19) ^ (t ^ (t >> 8));
+}
+
+template <typename T>
+void krand(int len, T* data) {
+  for (int i = 0; i < len; ++i) {
+    data[i] = krand();
+  }
+}
+
+#endif  // TESTS_CV_TEST_HELPER_H_
diff --git a/tests/cv/upsample.cc b/tests/cv/upsample.cc
new file mode 100644
index 0000000..6c6ea20
--- /dev/null
+++ b/tests/cv/upsample.cc
@@ -0,0 +1,145 @@
+// Copyright 2023 Google LLC
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#include "tests/cv/upsample.h"
+
+#include <cstdint>
+
+#include "crt/kelvin.h"
+
+namespace kelvin::cv {
+
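+// 2x bilinear upsample of two rows with 3:1 weights. With a = 3*in0[c] +
+// in0[c'] and b = 3*in1[c] + in1[c'] (c' = horizontal neighbor of c),
+//   out0[i] = (3*a + b + 8) >> 4 and out1[i] = (3*b + a + 8) >> 4.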
+void upsample(int num_output_cols, uint16_t* input0_row, uint16_t* input1_row,
+              uint16_t* output0_row, uint16_t* output1_row) {
+  // Mirror the edges using padded input buffers.
+  {
+    const int r = (num_output_cols / 2) - 1;
+    input0_row[-1] = input0_row[0];
+    input1_row[-1] = input1_row[0];
+    input0_row[r + 1] = input0_row[r];
+    input1_row[r + 1] = input1_row[r];
+  }
+
+  int vlenh;
+  getmaxvl_h(vlenh);
+
+  uint16_t* __restrict input0 = input0_row - vlenh;
+  uint16_t* __restrict input1 = input1_row - vlenh;
+  uint16_t* __restrict output0 = output0_row;
+  uint16_t* __restrict output1 = output1_row;
+
+#define prev0 v0
+#define prev1 v1
+#define curr0 v2
+#define curr1 v3
+#define next0 v4
+#define next1 v5
+#define c0 curr0
+#define c1 curr1
+#define p0 v6
+#define p0_0 v6
+#define p0_1 v7
+#define p1 v8
+#define p1_0 v8
+#define p1_1 v9
+#define n0 v10
+#define n0_0 v10
+#define n0_1 v11
+#define n1 v12
+#define n1_0 v12
+#define n1_1 v13
+#define a v14
+#define a_0 v14
+#define a_1 v15
+#define b v16
+#define b_0 v16
+#define b_1 v17
+#define ae v18
+#define ae_0 v18
+#define ae_1 v19
+#define be v20
+#define be_0 v20
+#define be_1 v21
+#define ao v22
+#define ao_0 v22
+#define ao_1 v23
+#define bo v24
+#define bo_0 v24
+#define bo_1 v25
+#define re0 v26
+#define re0_0 v26
+#define re0_1 v27
+#define re1 v28
+#define re1_0 v28
+#define re1_1 v29
+#define ro0 v30
+#define ro0_0 v30
+#define ro0_1 v31
+#define ro1 v32
+#define ro1_0 v32
+#define ro1_1 v33
+#define out0 v34
+#define out0_0 v34
+#define out0_1 v35
+#define out1 v36
+#define out1_0 v36
+#define out1_1 v37
+
+  vld_h_p_x(prev0, input0);
+  vld_h_p_x(prev1, input1);
+
+  vld_h_p_x(curr0, input0);
+  vld_h_p_x(curr1, input1);
+
+  for (int i = 0; i < num_output_cols; i += 2 * vlenh) {
+    vld_h_p_x(next0, input0);
+    vld_h_p_x(next1, input1);
+
+    vslidep_h_1_vv(p0, prev0, curr0);
+    vslidep_h_1_vv(p1, prev1, curr1);
+    vsliden_h_1_vv(n0, curr0, next0);
+    vsliden_h_1_vv(n1, curr1, next1);
+
+    vmulw_w_u_vx(ae, c0, 3);
+    vmulw_w_u_vx(be, c1, 3);
+    vacc_w_u_vv(ao, ae, n0);
+    vacc_w_u_vv(bo, be, n1);
+    vacc_w_u_vv(ae, ae, p0);
+    vacc_w_u_vv(be, be, p1);
+
+    vmvp_vv(re0, be_0, be_1);
+    vmvp_vv(re1, ae_0, ae_1);
+    vmvp_vv(ro0, bo_0, bo_1);
+    vmvp_vv(ro1, ao_0, ao_1);
+
+    vmacc_w_vx(re0_0, ae_0, 3);
+    vmacc_w_vx(re0_1, ae_1, 3);
+    vmacc_w_vx(re1_0, be_0, 3);
+    vmacc_w_vx(re1_1, be_1, 3);
+    vmacc_w_vx(ro0_0, ao_0, 3);
+    vmacc_w_vx(ro0_1, ao_1, 3);
+    vmacc_w_vx(ro1_0, bo_0, 3);
+    vmacc_w_vx(ro1_1, bo_1, 3);
+
+    vsransu_h_r_vx(re0, re0, 4);
+    vsransu_h_r_vx(re1, re1, 4);
+    vsransu_h_r_vx(ro0, ro0, 4);
+    vsransu_h_r_vx(ro1, ro1, 4);
+
+    vzip_h_vv(out0, re0, ro0);
+    vzip_h_vv(out1, re1, ro1);
+    vst_h_p_x(out0_0, output0);
+    vst_h_p_x(out0_1, output0);
+    vst_h_p_x(out1_0, output1);
+    vst_h_p_x(out1_1, output1);
+
+    vmvp_vv(prev0, curr0, curr1);
+    vmvp_vv(curr0, next0, next1);
+  }
+}
+
+};  // namespace kelvin::cv
diff --git a/tests/cv/upsample.h b/tests/cv/upsample.h
new file mode 100644
index 0000000..1831a28
--- /dev/null
+++ b/tests/cv/upsample.h
@@ -0,0 +1,20 @@
+// Copyright 2023 Google LLC
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef TESTS_CV_UPSAMPLE_H_
+#define TESTS_CV_UPSAMPLE_H_
+
+#include <cstdint>
+
+namespace kelvin::cv {
+
+// DUT: upsample.cc
+// REF: upsample_test.cc
+
+void upsample(int num_output_cols, uint16_t* input0_row, uint16_t* input1_row,
+              uint16_t* output0_row, uint16_t* output1_row);
+
+};  // namespace kelvin::cv
+
+#endif  // TESTS_CV_UPSAMPLE_H_
diff --git a/tests/cv/upsample_test.cc b/tests/cv/upsample_test.cc
new file mode 100644
index 0000000..c672575
--- /dev/null
+++ b/tests/cv/upsample_test.cc
@@ -0,0 +1,60 @@
+// Copyright 2023 Google LLC
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#include "tests/cv/upsample.h"
+
+#include <algorithm>
+#include <cstdint>
+
+#include "crt/kelvin.h"
+#include "tests/cv/test_helper.h"
+
+void upsample_test() {
+  constexpr int kNumOutputCols = 640;
+  constexpr int kEdge = 32;            // 512-bit vector / 16-bit lanes
+  constexpr int kPadding = kEdge * 2;  // left/right
+
+  uint16_t input0_row[kNumOutputCols / 2 + kPadding];
+  uint16_t input1_row[kNumOutputCols / 2 + kPadding];
+  uint16_t output0_row[kNumOutputCols];
+  uint16_t output1_row[kNumOutputCols];
+  uint16_t *input0_data = input0_row + kEdge;
+  uint16_t *input1_data = input1_row + kEdge;
+
+  krand(kNumOutputCols / 2 + kPadding, input0_row);
+  krand(kNumOutputCols / 2 + kPadding, input1_row);
+
+  kelvin::cv::upsample(kNumOutputCols, input0_data, input1_data, output0_row,
+                       output1_row);
+
+  constexpr int kHalfWidth = kNumOutputCols / 2 - 1;
+  for (int i = 0; i < kNumOutputCols; ++i) {
+    int c1 = std::clamp(i / 2, 0, kHalfWidth);
+    int c2 = std::clamp(i & 1 ? c1 + 1 : c1 - 1, 0, kHalfWidth);
+
+    const uint32_t a = 3 * input0_data[c1] + input0_data[c2];
+    const uint32_t b = 3 * input1_data[c1] + input1_data[c2];
+
+    const uint16_t ref0_value = (a * 3 + b + 8) / 16;
+    const uint16_t ref1_value = (b * 3 + a + 8) / 16;
+
+    if (ref0_value != output0_row[i]) {
+      printf("**error::upsample_test[%d,%d] %x %x\n", 0, i, ref0_value,
+             output0_row[i]);
+      exit(1);
+    }
+
+    if (ref1_value != output1_row[i]) {
+      printf("**error::upsample_test[%d,%d] %x %x\n", 1, i, ref1_value,
+             output1_row[i]);
+      exit(1);
+    }
+  }
+}
+
+int main() {
+  upsample_test();
+
+  return 0;
+}