Widen math ops rvv intrinsics test.

NOTE: This test uses a workaround to bipass LSU multi store issue

Change-Id: I64898817971275e708122ac9a8beb45f5e0144c2
diff --git a/kelvin_test_utils/sim_test_fixture.py b/kelvin_test_utils/sim_test_fixture.py
index 2f0ac06..ae85ad0 100644
--- a/kelvin_test_utils/sim_test_fixture.py
+++ b/kelvin_test_utils/sim_test_fixture.py
@@ -16,7 +16,9 @@
 
 from kelvin_test_utils.core_mini_axi_interface import CoreMiniAxiInterface
 
+
 class Fixture:
+
     def __init__(self, dut):
         self.core_mini_axi = CoreMiniAxiInterface(dut)
         self.entry_point = None
@@ -49,6 +51,6 @@
     async def read(self, symbol: str, size: int):
         return await self.core_mini_axi.read(self.symbols[symbol], size)
 
-    async def run_to_halt(self):
+    async def run_to_halt(self, timeout_cycles=10000):
         await self.core_mini_axi.execute_from(self.entry_point)
-        await self.core_mini_axi.wait_for_halted()
+        await self.core_mini_axi.wait_for_halted(timeout_cycles=timeout_cycles)
diff --git a/tests/cocotb/rvv/arithmetics/BUILD b/tests/cocotb/rvv/arithmetics/BUILD
index 5d2b813..8861569 100644
--- a/tests/cocotb/rvv/arithmetics/BUILD
+++ b/tests/cocotb/rvv/arithmetics/BUILD
@@ -14,11 +14,22 @@
 
 load("//rules:kelvin_v2.bzl", "kelvin_v2_binary")
 load("//rules:utils.bzl", "template_rule")
-load("//tests/cocotb/rvv/arithmetics:rvv_arithmetic.bzl", "rvv_arithmetic_test", "rvv_reduction_test")
+load("//tests/cocotb/rvv/arithmetics:rvv_arithmetic.bzl", "rvv_arithmetic_test", "rvv_reduction_test", "rvv_widen_arithmetic_test")
+
 package(default_visibility = ["//visibility:public"])
 
-MATH_OPS = ["add", "sub", "mul","div"]
-REDUCTION_OPS = ["redsum", "redmin", "redmax"]
+MATH_OPS = [
+    "add",
+    "sub",
+    "mul",
+    "div",
+]
+
+REDUCTION_OPS = [
+    "redsum",
+    "redmin",
+    "redmax",
+]
 
 # tuple format DTYPE (sew, sign, dtype, vl)
 DTYPES = [
@@ -29,8 +40,29 @@
     ("16", "u", "uint16", "8"),
     ("32", "u", "uint32", "4"),
 ]
-MATH_OP_TYPE_PAIRS = [(op, sew, sign, dtype, vl) for op in MATH_OPS for (sew, sign, dtype, vl) in DTYPES]
-REDUCTION_OP_TYPE_PAIRS = [(op, sew, sign, dtype, vl) for op in REDUCTION_OPS for (sew, sign, dtype, vl) in DTYPES]
+
+WIDEN_DTYPES = [
+    ("i", "int8", "int16", "8", "16", "8", "256"),
+    ("i", "int16", "int32", "16", "32", "4", "256"),
+]
+
+MATH_OP_TYPE_PAIRS = [
+    (op, sew, sign, dtype, vl)
+    for op in MATH_OPS
+    for (sew, sign, dtype, vl) in DTYPES
+]
+
+MATH_WIDEN_OP_TYPE_PAIRS = [
+    (op, sign, in_dtype, out_dtype, in_sew, out_sew, vl_step, num_test_values)
+    for op in MATH_OPS[:3]
+    for (sign, in_dtype, out_dtype, in_sew, out_sew, vl_step, num_test_values) in WIDEN_DTYPES
+]
+
+REDUCTION_OP_TYPE_PAIRS = [
+    (op, sew, sign, dtype, vl)
+    for op in REDUCTION_OPS
+    for (sew, sign, dtype, vl) in DTYPES
+]
 
 # Division has different op code for signed and usigned
 template_rule(
@@ -41,12 +73,12 @@
             "sew": sew,
             "sign": sign,
             "num_operands": vl,
-            "math_op": ['divu' if op == 'div' and dtype[0] == "u" else op][0],
+            "math_op": ["divu" if op == "div" and dtype[0] == "u" else op][0],
             "in_data_size": "16",
             "out_data_size": "16",
         }
         for (op, sew, sign, dtype, vl) in MATH_OP_TYPE_PAIRS
-    }
+    },
 )
 
 template_rule(
@@ -63,7 +95,24 @@
             "out_data_size": "16",
         }
         for (op, sew, sign, dtype, vl) in REDUCTION_OP_TYPE_PAIRS
-    }
+    },
+)
+
+template_rule(
+    rvv_widen_arithmetic_test,
+    {
+        "template_widen_{}_{}_{}".format(op, in_dtype, out_dtype): {
+            "in_dtype": in_dtype,
+            "out_dtype": out_dtype,
+            "in_sew": in_sew,
+            "out_sew": out_sew,
+            "sign": sign,
+            "step_operands": vl_step,
+            "math_op": op,
+            "num_test_values": num_test_values,
+        }
+        for (op, sign, in_dtype, out_dtype, in_sew, out_sew, vl_step, num_test_values) in MATH_WIDEN_OP_TYPE_PAIRS
+    },
 )
 
 template_rule(
@@ -73,7 +122,17 @@
             "srcs": ["template_{}_{}_m1".format(op, dtype)],
         }
         for (op, _, _, dtype, _) in MATH_OP_TYPE_PAIRS + REDUCTION_OP_TYPE_PAIRS
-    }
+    },
+)
+
+template_rule(
+    kelvin_v2_binary,
+    {
+        "rvv_widen_{}_{}_{}".format(op, in_dtype, out_dtype): {
+            "srcs": ["template_widen_{}_{}_{}".format(op, in_dtype, out_dtype)],
+        }
+        for (op, _, in_dtype, out_dtype, _, _, _, _) in MATH_WIDEN_OP_TYPE_PAIRS
+    },
 )
 
 filegroup(
@@ -81,5 +140,8 @@
     srcs = [
         ":rvv_{}_{}_m1.elf".format(op, dtype)
         for (op, _, _, dtype, _) in MATH_OP_TYPE_PAIRS + REDUCTION_OP_TYPE_PAIRS
+    ] + [
+        "rvv_widen_{}_{}_{}".format(op, in_dtype, out_dtype)
+        for (op, _, in_dtype, out_dtype, _, _, _, _) in MATH_WIDEN_OP_TYPE_PAIRS
     ],
 )
diff --git a/tests/cocotb/rvv/arithmetics/rvv_arithmetic.bzl b/tests/cocotb/rvv/arithmetics/rvv_arithmetic.bzl
index ff8ca27..9c8b56b 100644
--- a/tests/cocotb/rvv/arithmetics/rvv_arithmetic.bzl
+++ b/tests/cocotb/rvv/arithmetics/rvv_arithmetic.bzl
@@ -12,18 +12,24 @@
         **kwargs
     )
 
+def rvv_widen_arithmetic_test(**kwargs):
+    rvv_widen_arithmetic_template(
+        source_file = "{name}.cc".format(**kwargs),
+        **kwargs
+    )
+
 def rvv_arithmetic_template_impl(ctx):
     ctx.actions.expand_template(
         template = ctx.file._template,
         output = ctx.outputs.source_file,
         substitutions = {
-            "{DTYPE}" : ctx.attr.dtype,
-            "{IN_DATA_SIZE}" : ctx.attr.in_data_size,
-            "{OUT_DATA_SIZE}" : ctx.attr.out_data_size,
-            "{MATH_OP}" : ctx.attr.math_op,
-            "{NUM_OPERANDS}" : ctx.attr.num_operands,
-            "{SEW}" : ctx.attr.sew,
-            "{SIGN}" : ctx.attr.sign,
+            "{DTYPE}": ctx.attr.dtype,
+            "{IN_DATA_SIZE}": ctx.attr.in_data_size,
+            "{OUT_DATA_SIZE}": ctx.attr.out_data_size,
+            "{MATH_OP}": ctx.attr.math_op,
+            "{NUM_OPERANDS}": ctx.attr.num_operands,
+            "{SEW}": ctx.attr.sew,
+            "{SIGN}": ctx.attr.sign,
         },
     )
 
@@ -32,48 +38,83 @@
         template = ctx.file._template,
         output = ctx.outputs.source_file,
         substitutions = {
-            "{DTYPE}" : ctx.attr.dtype,
-            "{IN_DATA_SIZE}" : ctx.attr.in_data_size,
-            "{OUT_DATA_SIZE}" : ctx.attr.out_data_size,
-            "{REDUCTION_OP}" : ctx.attr.reduction_op,
-            "{NUM_OPERANDS}" : ctx.attr.num_operands,
-            "{SEW}" : ctx.attr.sew,
-            "{SIGN}" : ctx.attr.sign,
+            "{DTYPE}": ctx.attr.dtype,
+            "{IN_DATA_SIZE}": ctx.attr.in_data_size,
+            "{OUT_DATA_SIZE}": ctx.attr.out_data_size,
+            "{REDUCTION_OP}": ctx.attr.reduction_op,
+            "{NUM_OPERANDS}": ctx.attr.num_operands,
+            "{SEW}": ctx.attr.sew,
+            "{SIGN}": ctx.attr.sign,
+        },
+    )
+
+def rvv_widen_arithmetic_template_impl(ctx):
+    ctx.actions.expand_template(
+        template = ctx.file._template,
+        output = ctx.outputs.source_file,
+        substitutions = {
+            "{IN_DTYPE}": ctx.attr.in_dtype,
+            "{OUT_DTYPE}": ctx.attr.out_dtype,
+            "{IN_SEW}": ctx.attr.in_sew,
+            "{OUT_SEW}": ctx.attr.out_sew,
+            "{STEP_OPERANDS}": ctx.attr.step_operands,
+            "{MATH_OP}": ctx.attr.math_op,
+            "{SIGN}": ctx.attr.sign,
+            "{NUM_TEST_VALUES}": ctx.attr.num_test_values,
         },
     )
 
 rvv_arithmetic_template = rule(
     implementation = rvv_arithmetic_template_impl,
     attrs = {
-        "dtype" : attr.string(mandatory = True),
-        "in_data_size" : attr.string(mandatory = True),
-        "out_data_size" : attr.string(mandatory = True),
-        "math_op" : attr.string(mandatory = True),
-        "num_operands" : attr.string(mandatory = True),
-        "sew" : attr.string(mandatory = True),
-        "sign" : attr.string(mandatory = True),
+        "dtype": attr.string(mandatory = True),
+        "in_data_size": attr.string(mandatory = True),
+        "out_data_size": attr.string(mandatory = True),
+        "math_op": attr.string(mandatory = True),
+        "num_operands": attr.string(mandatory = True),
+        "sew": attr.string(mandatory = True),
+        "sign": attr.string(mandatory = True),
         "_template": attr.label(
             default = ":rvv_arithmetic_template.cc",
             allow_single_file = True,
         ),
-        "source_file": attr.output(mandatory=True)
-    }
+        "source_file": attr.output(mandatory = True),
+    },
 )
 
 rvv_reduction_template = rule(
     implementation = rvv_reduction_template_impl,
     attrs = {
-        "dtype" : attr.string(mandatory = True),
-        "in_data_size" : attr.string(mandatory = True),
-        "out_data_size" : attr.string(mandatory = True),
-        "reduction_op" : attr.string(mandatory = True),
-        "num_operands" : attr.string(mandatory = True),
-        "sew" : attr.string(mandatory = True),
-        "sign" : attr.string(mandatory = True),
+        "dtype": attr.string(mandatory = True),
+        "in_data_size": attr.string(mandatory = True),
+        "out_data_size": attr.string(mandatory = True),
+        "reduction_op": attr.string(mandatory = True),
+        "num_operands": attr.string(mandatory = True),
+        "sew": attr.string(mandatory = True),
+        "sign": attr.string(mandatory = True),
         "_template": attr.label(
             default = ":rvv_reduction_template.cc",
             allow_single_file = True,
         ),
-        "source_file": attr.output(mandatory=True)
-    }
-)
\ No newline at end of file
+        "source_file": attr.output(mandatory = True),
+    },
+)
+
+rvv_widen_arithmetic_template = rule(
+    implementation = rvv_widen_arithmetic_template_impl,
+    attrs = {
+        "in_dtype": attr.string(mandatory = True),
+        "out_dtype": attr.string(mandatory = True),
+        "math_op": attr.string(mandatory = True),
+        "step_operands": attr.string(mandatory = True),
+        "in_sew": attr.string(mandatory = True),
+        "out_sew": attr.string(mandatory = True),
+        "sign": attr.string(mandatory = True),
+        "num_test_values": attr.string(mandatory = True),
+        "_template": attr.label(
+            default = ":rvv_widen_arithmetic_template.cc",
+            allow_single_file = True,
+        ),
+        "source_file": attr.output(mandatory = True),
+    },
+)
diff --git a/tests/cocotb/rvv/arithmetics/rvv_widen_arithmetic_template.cc b/tests/cocotb/rvv/arithmetics/rvv_widen_arithmetic_template.cc
new file mode 100644
index 0000000..80f42e2
--- /dev/null
+++ b/tests/cocotb/rvv/arithmetics/rvv_widen_arithmetic_template.cc
@@ -0,0 +1,68 @@
+/*
+ * Copyright 2025 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <riscv_vector.h>
+
+{IN_DTYPE}_t in_buf_1[{NUM_TEST_VALUES}] __attribute__((section(".data")))
+__attribute__((aligned(16)));
+{IN_DTYPE}_t in_buf_2[{NUM_TEST_VALUES}] __attribute__((section(".data")))
+__attribute__((aligned(16)));
+{OUT_DTYPE}_t out_buf_widen[{NUM_TEST_VALUES}]
+    __attribute__((section(".data"))) __attribute__((aligned(16)));
+
+// Todo vx and double widens as well
+
+void {MATH_OP}_widen_test(const {IN_DTYPE}_t* in_buf_1,
+                          const {IN_DTYPE}_t* in_buf_2,
+                          {OUT_DTYPE}_t* out_buf_widen) {
+  uint8_t num_operands = 4 * {STEP_OPERANDS};
+
+  for (int i = 0; i + num_operands <= {NUM_TEST_VALUES}; i += num_operands) {
+    v{IN_DTYPE}m2_t vin_buf_1 =
+        __riscv_vle{IN_SEW}_v_{SIGN}{IN_SEW}m2(in_buf_1 + i, num_operands);
+    v{IN_DTYPE}m2_t vin_buf_2 =
+        __riscv_vle{IN_SEW}_v_{SIGN}{IN_SEW}m2(in_buf_2 + i, num_operands);
+    v{OUT_DTYPE}m4_t vresult_widen =
+    __riscv_vw{MATH_OP}_vv_{SIGN}{OUT_SEW}m4(vin_buf_1, vin_buf_2, num_operands);
+
+    __riscv_vse{OUT_SEW}_v_{SIGN}{OUT_SEW}m1(
+        out_buf_widen + i + (0 * {STEP_OPERANDS}),
+        __riscv_vget_v_{SIGN}{OUT_SEW}m4_{SIGN}{OUT_SEW}m1(vresult_widen,
+                                                               0),
+        {STEP_OPERANDS});
+    __riscv_vse{OUT_SEW}_v_{SIGN}{OUT_SEW}m1(
+        out_buf_widen + i + (1 * {STEP_OPERANDS}),
+        __riscv_vget_v_{SIGN}{OUT_SEW}m4_{SIGN}{OUT_SEW}m1(vresult_widen,
+                                                               1),
+        {STEP_OPERANDS});
+    __riscv_vse{OUT_SEW}_v_{SIGN}{OUT_SEW}m1(
+        out_buf_widen + i + (2 * {STEP_OPERANDS}),
+        __riscv_vget_v_{SIGN}{OUT_SEW}m4_{SIGN}{OUT_SEW}m1(vresult_widen,
+                                                               2),
+        {STEP_OPERANDS});
+    __riscv_vse{OUT_SEW}_v_{SIGN}{OUT_SEW}m1(
+        out_buf_widen + i + (3 * {STEP_OPERANDS}),
+        __riscv_vget_v_{SIGN}{OUT_SEW}m4_{SIGN}{OUT_SEW}m1(vresult_widen,
+                                                               3),
+        {STEP_OPERANDS});
+    asm volatile("fence");
+  }
+}
+
+int main(int argc, char** argv) {
+  {MATH_OP}_widen_test(in_buf_1, in_buf_2, out_buf_widen);
+  return 0;
+}
diff --git a/tests/cocotb/rvv_arithmetic_cocotb_test.py b/tests/cocotb/rvv_arithmetic_cocotb_test.py
index d8eb73d..607064d 100644
--- a/tests/cocotb/rvv_arithmetic_cocotb_test.py
+++ b/tests/cocotb/rvv_arithmetic_cocotb_test.py
@@ -21,19 +21,26 @@
 from bazel_tools.tools.python.runfiles import runfiles
 from kelvin_test_utils.sim_test_fixture import Fixture
 
+STR_TO_NP_TYPE = {
+    "int8": np.int8,
+    "int16": np.int16,
+    "int32": np.int32,
+    "uint8": np.uint8,
+    "uint16": np.uint16,
+    "uint32": np.uint32,
+}
 
-def _get_math_result(x: np.array,
-                     y: np.array,
-                     symbol: str):
+
+def _get_math_result(x: np.array, y: np.array, symbol: str, dtype=None):
     if symbol == 'add':
-        return np.add(x, y)
+        return np.add(x, y, dtype=dtype)
     elif symbol == 'sub':
-        return np.subtract(x, y)
+        return np.subtract(x, y, dtype=dtype)
     elif symbol == 'mul':
-        return np.multiply(x,y)
+        return np.multiply(x, y, dtype=dtype)
     elif symbol == 'div':
         orig_settings = np.seterr(divide='ignore')
-        divide_output = np.divide(x, y)
+        divide_output = np.divide(x, y, dtype=dtype)
         np.seterr(**orig_settings)
         return divide_output
     elif symbol == 'redsum':
@@ -45,61 +52,63 @@
     raise ValueError(f"Unsupported math symbol: {symbol}")
 
 
-async def arithmetic_m1_vanilla_ops_test(dut,
-                          dtypes,
-                          math_ops: str,
-                          num_bytes: int):
-
+async def arithmetic_m1_vanilla_ops_test(dut, dtypes, math_ops: str,
+                                         num_bytes: int):
     """RVV arithmetic test template.
 
     Each test performs a math op loading `in_buf_1` and `in_buf_2` and storing the output to `out_buf`.
     """
-    str_to_np_type ={
-        "int8": np.int8,
-        "int16": np.int16,
-        "int32": np.int32,
-        "uint8": np.uint8,
-        "uint16": np.uint16,
-        "uint32": np.uint32,
-    }
-    m1_vanilla_op_elfs = [f"rvv_{math_op}_{dtype}_m1.elf" for math_op in math_ops for dtype in dtypes]
+    m1_vanilla_op_elfs = [
+        f"rvv_{math_op}_{dtype}_m1.elf" for math_op in math_ops
+        for dtype in dtypes
+    ]
     pattern_extract = re.compile("rvv_(.*)_(.*)_m1.elf")
 
-
     r = runfiles.Create()
     fixture = await Fixture.Create(dut)
     with tqdm.tqdm(m1_vanilla_op_elfs) as t:
         for elf_name in tqdm.tqdm(m1_vanilla_op_elfs):
             t.set_postfix({"binary": os.path.basename(elf_name)})
-            elf_path = r.Rlocation("kelvin_hw/tests/cocotb/rvv/arithmetics/" + elf_name)
+            elf_path = r.Rlocation("kelvin_hw/tests/cocotb/rvv/arithmetics/" +
+                                   elf_name)
             await fixture.load_elf_and_lookup_symbols(
                 elf_path,
                 ['in_buf_1', 'in_buf_2', 'out_buf'],
             )
             math_op, dtype = pattern_extract.match(elf_name).groups()
-            np_type = str_to_np_type[dtype]
-            num_values = int(num_bytes / np.dtype(np_type).itemsize)
+            np_type = STR_TO_NP_TYPE[dtype]
+            num_test_values = int(num_bytes / np.dtype(np_type).itemsize)
             min_value = np.iinfo(np_type).min
             max_value = np.iinfo(np_type).max + 1  # One above.
-            input_1 = np.random.randint(min_value, max_value, num_values, dtype=np_type)
-            input_2 = np.random.randint(min_value, max_value, num_values, dtype=np_type)
-            expected_output = np.asarray(_get_math_result(input_1, input_2, math_op), dtype=np_type)
+            input_1 = np.random.randint(min_value,
+                                        max_value,
+                                        num_test_values,
+                                        dtype=np_type)
+            input_2 = np.random.randint(min_value,
+                                        max_value,
+                                        num_test_values,
+                                        dtype=np_type)
+            expected_output = np.asarray(_get_math_result(
+                input_1, input_2, math_op),
+                                         dtype=np_type)
             if math_op == "div":
                 # riscv_vdiv clobbers divide by zero with -1
                 # riscv_vdivu clobbers divide by zero with max value of SEW
                 for idx, divisor in enumerate(input_2):
                     if divisor == 0 and dtype[:3] == "int":
                         expected_output[idx] = -1
-                    elif  divisor == 0 and dtype[:4] == "uint":
+                    elif divisor == 0 and dtype[:4] == "uint":
                         expected_output[idx] = max_value - 1
 
             await fixture.write('in_buf_1', input_1)
             await fixture.write('in_buf_2', input_2)
-            await fixture.write('out_buf', np.zeros([num_values], dtype=np_type))
+            await fixture.write('out_buf',
+                                np.zeros([num_test_values], dtype=np_type))
 
             await fixture.run_to_halt()
 
-            actual_output = (await fixture.read('out_buf', num_bytes)).view(np_type)
+            actual_output = (await fixture.read('out_buf',
+                                                num_bytes)).view(np_type)
             debug_msg = str({
                 'input_1': input_1,
                 'input_2': input_2,
@@ -109,32 +118,26 @@
 
             assert (actual_output == expected_output).all(), debug_msg
 
+
 @cocotb.test()
 async def arithmetic_m1_vanilla_ops(dut):
-    await arithmetic_m1_vanilla_ops_test(dut = dut,
-                          dtypes = ["int8", "int16", "int32", "uint8", "uint16", "uint32"],
-                          math_ops = ["add", "sub", "mul", "div"],
-                          num_bytes = 16)
+    await arithmetic_m1_vanilla_ops_test(
+        dut=dut,
+        dtypes=["int8", "int16", "int32", "uint8", "uint16", "uint32"],
+        math_ops=["add", "sub", "mul", "div"],
+        num_bytes=16)
 
 
-async def reduction_m1_vanilla_ops_test(dut,
-                          dtypes,
-                          math_ops: str,
-                          num_bytes: int):
-
+async def reduction_m1_vanilla_ops_test(dut, dtypes, math_ops: str,
+                                        num_bytes: int):
     """RVV reduction test template.
 
     Each test performs a reduction op loading `in_buf_1` and storing the output to `out_buf`.
     """
-    str_to_np_type ={
-        "int8": np.int8,
-        "int16": np.int16,
-        "int32": np.int32,
-        "uint8": np.uint8,
-        "uint16": np.uint16,
-        "uint32": np.uint32,
-    }
-    m1_vanilla_op_elfs = [f"rvv_{math_op}_{dtype}_m1.elf" for math_op in math_ops for dtype in dtypes]
+    m1_vanilla_op_elfs = [
+        f"rvv_{math_op}_{dtype}_m1.elf" for math_op in math_ops
+        for dtype in dtypes
+    ]
     pattern_extract = re.compile("rvv_(.*)_(.*)_m1.elf")
 
     r = runfiles.Create()
@@ -142,27 +145,34 @@
     with tqdm.tqdm(m1_vanilla_op_elfs) as t:
         for elf_name in tqdm.tqdm(m1_vanilla_op_elfs):
             t.set_postfix({"binary": os.path.basename(elf_name)})
-            elf_path = r.Rlocation(f"kelvin_hw/tests/cocotb/rvv/arithmetics/{elf_name}")
+            elf_path = r.Rlocation(
+                f"kelvin_hw/tests/cocotb/rvv/arithmetics/{elf_name}")
             await fixture.load_elf_and_lookup_symbols(
                 elf_path,
                 ['in_buf_1', 'scalar_input', 'out_buf'],
             )
             math_op, dtype = pattern_extract.match(elf_name).groups()
-            np_type = str_to_np_type[dtype]
+            np_type = STR_TO_NP_TYPE[dtype]
             itemsize = np.dtype(np_type).itemsize
-            num_values = int(num_bytes / np.dtype(np_type).itemsize)
+            num_test_values = int(num_bytes / np.dtype(np_type).itemsize)
             min_value = np.iinfo(np_type).min
             max_value = np.iinfo(np_type).max + 1  # One above.
-            input_1 = np.random.randint(min_value, max_value, num_values, dtype=np_type)
+            input_1 = np.random.randint(min_value,
+                                        max_value,
+                                        num_test_values,
+                                        dtype=np_type)
             input_2 = np.random.randint(min_value, max_value, 1, dtype=np_type)
-            expected_output = np.asarray(_get_math_result(input_1, input_2, math_op), dtype=np_type)
+            expected_output = np.asarray(_get_math_result(
+                input_1, input_2, math_op),
+                                         dtype=np_type)
 
             await fixture.write('in_buf_1', input_1)
             await fixture.write('scalar_input', input_2)
             await fixture.write('out_buf', np.zeros(1, dtype=np_type))
             await fixture.run_to_halt()
 
-            actual_output = (await fixture.read('out_buf', itemsize)).view(np_type)
+            actual_output = (await fixture.read('out_buf',
+                                                itemsize)).view(np_type)
             debug_msg = str({
                 'input_1': input_1,
                 'input_2': input_2,
@@ -171,9 +181,86 @@
             })
             assert (actual_output == expected_output).all(), debug_msg
 
+
 @cocotb.test()
 async def reduction_m1_vanilla_ops(dut):
-    await reduction_m1_vanilla_ops_test(dut = dut,
-                          dtypes = ["int8", "int16", "int32", "uint8", "uint16", "uint32"],
-                          math_ops = ["redsum", "redmin", "redmax"],
-                          num_bytes = 16)
+    await reduction_m1_vanilla_ops_test(
+        dut=dut,
+        dtypes=["int8", "int16", "int32", "uint8", "uint16", "uint32"],
+        math_ops=["redsum", "redmin", "redmax"],
+        num_bytes=16)
+
+
+async def _widen_math_ops_test_impl(
+    dut,
+    dtypes,
+    math_ops: str,
+    num_test_values: int = 256,
+):
+    """RVV widen arithmetic test template.
+
+    Each test performs a widen math op on 256 random inputs and stores into output buffer.
+    """
+    widen_op_elfs = [
+        f"rvv_widen_{math_op}_{in_dtype}_{out_dtype}.elf"
+        for math_op in math_ops for in_dtype, out_dtype in dtypes
+    ]
+    pattern_extract = re.compile("rvv_widen_(.*)_(.*)_(.*).elf")
+
+    r = runfiles.Create()
+    fixture = await Fixture.Create(dut)
+    with tqdm.tqdm(widen_op_elfs) as t:
+        for elf_name in tqdm.tqdm(widen_op_elfs):
+            t.set_postfix({"binary": os.path.basename(elf_name)})
+            elf_path = r.Rlocation("kelvin_hw/tests/cocotb/rvv/arithmetics/" +
+                                   elf_name)
+            await fixture.load_elf_and_lookup_symbols(
+                elf_path,
+                ['in_buf_1', 'in_buf_2', 'out_buf_widen'],
+            )
+            math_op, in_dtype, out_dtype = pattern_extract.match(
+                elf_name).groups()
+            in_np_type = STR_TO_NP_TYPE[in_dtype]
+            out_np_type = STR_TO_NP_TYPE[out_dtype]
+
+            min_value = np.iinfo(in_np_type).min
+            max_value = np.iinfo(in_np_type).max + 1  # One above.
+            input_1 = np.random.randint(min_value,
+                                        max_value,
+                                        num_test_values,
+                                        dtype=in_np_type)
+            input_2 = np.random.randint(min_value,
+                                        max_value,
+                                        num_test_values,
+                                        dtype=in_np_type)
+            expected_output = np.asarray(_get_math_result(input_1,
+                                                          input_2,
+                                                          math_op,
+                                                          dtype=out_np_type),
+                                         dtype=out_np_type)
+            await fixture.write('in_buf_1', input_1)
+            await fixture.write('in_buf_2', input_2)
+            await fixture.write('out_buf_widen',
+                                np.zeros([num_test_values], dtype=out_np_type))
+            await fixture.run_to_halt()
+
+            actual_output = (await fixture.read(
+                'out_buf_widen',
+                num_test_values *
+                np.dtype(out_np_type).itemsize)).view(out_np_type)
+            debug_msg = str({
+                'input_1': input_1,
+                'input_2': input_2,
+                'expected': expected_output,
+                'actual': actual_output,
+            })
+
+            assert (actual_output == expected_output).all(), debug_msg
+
+
+@cocotb.test()
+async def widen_math_ops_test_impl(dut):
+    await _widen_math_ops_test_impl(dut=dut,
+                                    dtypes=[['int8', 'int16'],
+                                            ['int16', 'int32']],
+                                    math_ops=['add', 'sub', 'mul'])