[otbn] Allow mulqacc to set flags

The bn.mulqacc.wo and bn.mulqacc.so instructions now set M, L and Z
flags in a specified flag group. We needed a little bit of shuffling
in the encoding to make this fit.

The changes are as follows:

 - There's now a flag group field at bit 31 (the same as other
   instructions that take a flag group)

 - The "wb" field, which chooses between the three writeback modes,
   has moved from bits 31-30 to bits 30-29 and has been split into two
   parts: "so" and "wb0". "so" is set only for bn.mulqacc.so (just like
   the top bit of "wb"). If "so" is not set, then "wb0" selects
   between bn.mulqacc and bn.mulqacc.wo. If "so" is set, "wb0" selects
   the destination halfword.

This commit includes all ISS, RTL and specification changes required to
implement the above.

Fixes #2979

Signed-off-by: Rupert Swarbrick <rswarbrick@lowrisc.org>
Signed-off-by: Greg Chadwick <gac@lowrisc.org>
diff --git a/hw/ip/otbn/data/bignum-insns.yml b/hw/ip/otbn/data/bignum-insns.yml
index 3cb5841..a855473 100644
--- a/hw/ip/otbn/data/bignum-insns.yml
+++ b/hw/ip/otbn/data/bignum-insns.yml
@@ -208,11 +208,10 @@
     [<zero_acc>] <wrs1>.<wrs1_qwsel>, <wrs2>.<wrs2_qwsel>, <acc_shift_imm>
   glued-ops: true
   doc: |
-    Multiplies two `WLEN/4` WDR values, shifts the product by `acc_shift_imm` bit, and adds the result to the accumulator.
+    Multiplies two `WLEN/4` WDR values, shifts the product by `acc_shift_imm` bits, and adds the result to the accumulator.
 
     For versions of the instruction with writeback, see `BN.MULQACC.WO` and `BN.MULQACC.SO`.
   decode: |
-    writeback_variant = None
     zero_accumulator = DecodeMulqaccZeroacc(zero_acc)
 
     d = None
@@ -222,7 +221,7 @@
     d_hwsel = None
     a_qwsel = DecodeQuarterWordSelect(wrs1_qwsel)
     b_qwsel = DecodeQuarterWordSelect(wrs2_qwsel)
-  operation: &mulqacc-operation |
+  operation: |
     a_qw = GetQuarterWord(a, a_qwsel)
     b_qw = GetQuarterWord(b, b_qwsel)
 
@@ -232,26 +231,17 @@
       ACC = 0
 
     ACC = ACC + (mul_res << acc_shift_imm)
-
-    if writeback_variant == 'shiftout':
-      if d_hwsel == 'L':
-        WDR[d][WLEN/2-1:0] = ACC[WLEN/2-1:0]
-      elif d_hwsel == 'U':
-        WDR[d][WLEN-1:WLEN/2] = ACC[WLEN/2-1:0]
-      ACC = ACC >> (WLEN/2)
-
-    elif writeback_variant == 'writeout':
-      WDR[d] = ACC
   encoding:
     scheme: bnaq
     mapping:
-      wb: b00
-      dh: bx
+      fg: bx
+      so: b0
+      wb0: b0
       qs2: wrs2_qwsel
       qs1: wrs1_qwsel
       wrs2: wrs2
       wrs1: wrs1
-      acc: acc_shift_imm
+      shift: acc_shift_imm
       z: zero_acc
       wrd: bxxxxx
 
@@ -267,14 +257,15 @@
     - *mulqacc-wrs2
     - *mulqacc-wrs2-qwsel
     - *mulqacc-acc-shift-imm
+    - *bn-flag-group-operand
   syntax: |
-    [<zero_acc>] <wrd>, <wrs1>.<wrs1_qwsel>, <wrs2>.<wrs2_qwsel>, <acc_shift_imm>
+    [<zero_acc>] <wrd>, <wrs1>.<wrs1_qwsel>, <wrs2>.<wrs2_qwsel>, <acc_shift_imm>[, FG<flag_group>]
   glued-ops: true
   doc: |
-    Multiplies two `WLEN/4` WDR values, shifts the product by `acc_shift_imm` bit, and adds the result to the accumulator.
+    Multiplies two `WLEN/4` WDR values, shifts the product by `acc_shift_imm` bits, and adds the result to the accumulator.
     Writes the resulting accumulator to `wrd`.
+    Updates the M, L and Z flags of `flag_group`.
   decode: |
-    writeback_variant = 'writeout'
     zero_accumulator = DecodeMulqaccZeroacc(zero_acc)
 
     d = UInt(wrd)
@@ -284,17 +275,34 @@
     d_hwsel = None
     a_qwsel = DecodeQuarterWordSelect(wrs1_qwsel)
     b_qwsel = DecodeQuarterWordSelect(wrs2_qwsel)
-  operation: *mulqacc-operation
+
+    fg = DecodeFlagGroup(flag_group)
+  operation: |
+    a_qw = GetQuarterWord(a, a_qwsel)
+    b_qw = GetQuarterWord(b, b_qwsel)
+
+    mul_res = a_qw * b_qw
+
+    if zero_accumulator:
+      ACC = 0
+
+    ACC = ACC + (mul_res << acc_shift_imm)
+
+    WDR[d] = ACC
+    FLAGS[fg].M = ACC[WLEN-1]
+    FLAGS[fg].L = ACC[0]
+    FLAGS[fg].Z = (ACC == 0)
   encoding:
     scheme: bnaq
     mapping:
-      wb: b01
-      dh: bx
+      fg: flag_group
+      so: b0
+      wb0: b1
       qs2: wrs2_qwsel
       qs1: wrs1_qwsel
       wrs2: wrs2
       wrs1: wrs1
-      acc: acc_shift_imm
+      shift: acc_shift_imm
       z: zero_acc
       wrd: wrd
 
@@ -313,17 +321,25 @@
     - *mulqacc-wrs2
     - *mulqacc-wrs2-qwsel
     - *mulqacc-acc-shift-imm
+    - *bn-flag-group-operand
   syntax: |
     [<zero_acc>] <wrd>.<wrd_hwsel>,
-    <wrs1>.<wrs1_qwsel>, <wrs2>.<wrs2_qwsel>, <acc_shift_imm>
+    <wrs1>.<wrs1_qwsel>, <wrs2>.<wrs2_qwsel>, <acc_shift_imm>[, FG<flag_group>]
   glued-ops: true
   doc: |
-    Multiplies two `WLEN/4` WDR values, shifts the product by `<acc_shift_imm>` and adds the result to the accumulator.
-    Next, shifts the resulting accumulator right by half a word.
-    The bits that are shifted out are written to a half-word of `<wrd>`, selected with `<wrd_hwsel>`.
+    Multiplies two `WLEN/4` WDR values, shifts the product by `acc_shift_imm` bits and adds the result to the accumulator.
+    Next, shifts the resulting accumulator right by half a word (128 bits).
+    The bits that are shifted out are written to a half-word of `wrd`, selected with `wrd_hwsel`.
 
+    This instruction never changes the `C` flag.
+    If `wrd_hwsel` is zero (so the instruction is updating the lower half-word of `wrd`), it updates the `L` and `Z` flags and leaves `M` unchanged.
+    The `L` flag is set iff the bottom bit of the shifted-out result is set.
+    The `Z` flag is set iff the shifted-out result is zero.
+
+    If `wrd_hwsel` is one (so the instruction is updating the upper half-word of `wrd`), it updates the `M` and `Z` flags and leaves `L` unchanged.
+    The `M` flag is set iff the top bit of the shifted-out result is set.
+    The `Z` flag is left unchanged if the shifted-out result is zero and cleared if not.
   decode: |
-    writeback_variant = 'shiftout'
     zero_accumulator = DecodeMulqaccZeroacc(zero_acc)
 
     d = UInt(wrd)
@@ -333,17 +349,42 @@
     d_hwsel = DecodeHalfWordSelect(wrd_hwsel)
     a_qwsel = DecodeQuarterWordSelect(wrs1_qwsel)
     b_qwsel = DecodeQuarterWordSelect(wrs2_qwsel)
-  operation: *mulqacc-operation
+
+    fg = DecodeFlagGroup(flag_group)
+  operation: |
+    a_qw = GetQuarterWord(a, a_qwsel)
+    b_qw = GetQuarterWord(b, b_qwsel)
+
+    mul_res = a_qw * b_qw
+
+    if zero_accumulator:
+      ACC = 0
+
+    ACC = ACC + (mul_res << acc_shift_imm)
+
+    shifted = ACC[WLEN/2-1:0]
+    ACC = ACC >> (WLEN/2)
+
+    if d_hwsel == 'L':
+      WDR[d][WLEN/2-1:0] = shifted
+      FLAGS[fg].L = shifted[0]
+      FLAGS[fg].Z = (shifted == 0)
+    elif d_hwsel == 'U':
+      WDR[d][WLEN-1:WLEN/2] = shifted
+      FLAGS[fg].M = shifted[WLEN/2-1]
+      if (shifted != 0):
+        FLAGS[fg].Z = 0
   encoding:
     scheme: bnaq
     mapping:
-      wb: b1x
-      dh: wrd_hwsel
+      fg: flag_group
+      so: b1
+      wb0: wrd_hwsel
       qs2: wrs2_qwsel
       qs1: wrs1_qwsel
       wrs2: wrs2
       wrs1: wrs1
-      acc: acc_shift_imm
+      shift: acc_shift_imm
       z: zero_acc
       wrd: wrd
 
diff --git a/hw/ip/otbn/data/enc-schemes.yml b/hw/ip/otbn/data/enc-schemes.yml
index 34f8ae3..0df2a01 100644
--- a/hw/ip/otbn/data/enc-schemes.yml
+++ b/hw/ip/otbn/data/enc-schemes.yml
@@ -269,12 +269,13 @@
   parents:
     - custom2
     - wdr3
+    - fg
   fields:
-    wb: 31-30
-    dh: 29
+    so: 30
+    wb0: 29
     qs2: 28-27
     qs1: 26-25
-    acc: 14-13
+    shift: 14-13
     z: 12
 
 # Unusual scheme used for bn.rshi (the immediate bleeds into the usual funct3
diff --git a/hw/ip/otbn/dv/otbnsim/sim/flags.py b/hw/ip/otbn/dv/otbnsim/sim/flags.py
index e07dd33..c195e23 100644
--- a/hw/ip/otbn/dv/otbnsim/sim/flags.py
+++ b/hw/ip/otbn/dv/otbnsim/sim/flags.py
@@ -22,8 +22,8 @@
 
     def __init__(self, C: bool, M: bool, L: bool, Z: bool):
         self.C = C
-        self.L = L
         self.M = M
+        self.L = L
         self.Z = Z
 
         self._new_val = None  # type: Optional['FlagReg']
diff --git a/hw/ip/otbn/dv/otbnsim/sim/insn.py b/hw/ip/otbn/dv/otbnsim/sim/insn.py
index c36791d..836bd4b 100644
--- a/hw/ip/otbn/dv/otbnsim/sim/insn.py
+++ b/hw/ip/otbn/dv/otbnsim/sim/insn.py
@@ -4,9 +4,10 @@
 
 from typing import Dict
 
-from .state import OTBNState
+from .flags import FlagReg
 from .isa import (OTBNInsn, RV32RegReg, RV32RegImm, RV32ImmShift,
                   insn_for_mnemonic, logical_byte_shift)
+from .state import OTBNState
 
 
 class ADD(RV32RegReg):
@@ -452,7 +453,7 @@
 
 
 class BNMULQACCWO(OTBNInsn):
-    insn = insn_for_mnemonic('bn.mulqacc.wo', 7)
+    insn = insn_for_mnemonic('bn.mulqacc.wo', 8)
 
     def __init__(self, op_vals: Dict[str, int]):
         super().__init__(op_vals)
@@ -463,6 +464,7 @@
         self.wrs2 = op_vals['wrs2']
         self.wrs2_qwsel = op_vals['wrs2_qwsel']
         self.acc_shift_imm = op_vals['acc_shift_imm']
+        self.flag_group = op_vals['flag_group']
 
     def execute(self, state: OTBNState) -> None:
         a_qw = state.get_quarter_word_unsigned(self.wrs1, self.wrs1_qwsel)
@@ -479,10 +481,11 @@
         truncated = acc & ((1 << 256) - 1)
         state.wdrs.get_reg(self.wrd).write_unsigned(truncated)
         state.wsrs.ACC.write_unsigned(truncated)
+        state.set_mlz_flags(self.flag_group, truncated)
 
 
 class BNMULQACCSO(OTBNInsn):
-    insn = insn_for_mnemonic('bn.mulqacc.so', 8)
+    insn = insn_for_mnemonic('bn.mulqacc.so', 9)
 
     def __init__(self, op_vals: Dict[str, int]):
         super().__init__(op_vals)
@@ -494,6 +497,7 @@
         self.wrs2 = op_vals['wrs2']
         self.wrs2_qwsel = op_vals['wrs2_qwsel']
         self.acc_shift_imm = op_vals['acc_shift_imm']
+        self.flag_group = op_vals['flag_group']
 
     def execute(self, state: OTBNState) -> None:
         a_qw = state.get_quarter_word_unsigned(self.wrs1, self.wrs1_qwsel)
@@ -513,6 +517,19 @@
         state.set_half_word_unsigned(self.wrd, self.wrd_hwsel, lo_part)
         state.wsrs.ACC.write_unsigned(hi_part)
 
+        old_flags = state.csrs.flags[self.flag_group]
+        if self.wrd_hwsel:
+            new_flags = FlagReg(C=old_flags.C,
+                                M=bool((lo_part >> 127) & 1),
+                                L=old_flags.L,
+                                Z=old_flags.Z and lo_part == 0)
+        else:
+            new_flags = FlagReg(C=old_flags.C,
+                                M=old_flags.M,
+                                L=bool(lo_part & 1),
+                                Z=lo_part == 0)
+        state.csrs.flags[self.flag_group] = new_flags
+
 
 class BNSUB(OTBNInsn):
     insn = insn_for_mnemonic('bn.sub', 6)
diff --git a/hw/ip/otbn/dv/tracer/rtl/otbn_trace_intf.sv b/hw/ip/otbn/dv/tracer/rtl/otbn_trace_intf.sv
index 63a28c0..8004dee 100644
--- a/hw/ip/otbn/dv/tracer/rtl/otbn_trace_intf.sv
+++ b/hw/ip/otbn/dv/tracer/rtl/otbn_trace_intf.sv
@@ -196,14 +196,21 @@
   flags_t                 flags_write_data [NFlagGroups];
   logic [NFlagGroups-1:0] flags_read;
   flags_t                 flags_read_data [NFlagGroups];
+  logic                   flag_group_read_op;
+
+  // Determine if current instruction reads a flag group specified in the instruction.
+  assign flag_group_read_op =
+      alu_bignum_operation.mac_flag_en                                                  |
+      (alu_bignum_operation.op inside {AluOpBignumAddc, AluOpBignumSubb, AluOpBignumSel,
+                                       AluOpBignumXor, AluOpBignumOr, AluOpBignumAnd,
+                                       AluOpBignumNot});
 
   for (genvar i_fg = 0; i_fg < NFlagGroups; i_fg++) begin : g_flag_group_acceses
     assign flags_write[i_fg] = u_otbn_alu_bignum.flags_en[i_fg];
     assign flags_write_data[i_fg] = u_otbn_alu_bignum.flags_d[i_fg];
 
     assign flags_read[i_fg] = (any_ispr_read & (ispr_addr == IsprFlags)) |
-        ((alu_bignum_operation.op inside {AluOpBignumAddc, AluOpBignumSubb, AluOpBignumSel}) &
-         (alu_bignum_operation.flag_group == i_fg) & insn_fetch_resp_valid);
+         (flag_group_read_op & (alu_bignum_operation.flag_group == i_fg) & insn_fetch_resp_valid);
 
     assign flags_read_data[i_fg] = u_otbn_alu_bignum.flags_q[i_fg];
   end
diff --git a/hw/ip/otbn/rtl/otbn_alu_bignum.sv b/hw/ip/otbn/rtl/otbn_alu_bignum.sv
index 3522aae..2227a5d 100644
--- a/hw/ip/otbn/rtl/otbn_alu_bignum.sv
+++ b/hw/ip/otbn/rtl/otbn_alu_bignum.sv
@@ -84,6 +84,9 @@
   output logic [WLEN-1:0]             ispr_acc_wr_data_o,
   output logic                        ispr_acc_wr_en_o,
 
+  input  flags_t                      mac_operation_flags_i,
+  input  flags_t                      mac_operation_flags_en_i,
+
   input  logic [WLEN-1:0]             rnd_i
 );
   ///////////
@@ -100,16 +103,25 @@
   logic                                adder_update_flags_en, adder_update_flags_en_raw;
   flags_t                              logic_update_flags;
   logic                                logic_update_flags_en, logic_update_flags_en_raw;
+  flags_t                              mac_update_flags;
+  logic                                mac_update_flags_en;
   logic                                ispr_update_flags_en;
 
-  assign adder_update_flags_en = operation_i.flag_en & adder_update_flags_en_raw;
-  assign logic_update_flags_en = operation_i.flag_en & logic_update_flags_en_raw;
+  assign adder_update_flags_en = operation_i.alu_flag_en & adder_update_flags_en_raw;
+  assign logic_update_flags_en = operation_i.alu_flag_en & logic_update_flags_en_raw;
+  assign mac_update_flags_en   = operation_i.mac_flag_en;
 
   assign ispr_update_flags_en = (ispr_base_wr_en_i[0] & (ispr_addr_i == IsprFlags));
 
 
   `ASSERT(UpdateFlagsOnehot,
-          $onehot0({adder_update_flags_en, logic_update_flags_en, ispr_update_flags_en}))
+          $onehot0({adder_update_flags_en, logic_update_flags_en, mac_update_flags_en,
+                    ispr_update_flags_en}))
+
+  assign selected_flags = flags_q[operation_i.flag_group];
+
+  assign mac_update_flags = (selected_flags        & ~mac_operation_flags_en_i) |
+                            (mac_operation_flags_i &  mac_operation_flags_en_i);
 
   for (genvar i_fg = 0; i_fg < NFlagGroups; i_fg++) begin : g_flag_groups
     always_ff @(posedge clk_i or negedge rst_ni) begin
@@ -132,6 +144,7 @@
       unique case (1'b1)
         adder_update_flags_en: flags_d[i_fg] = adder_update_flags;
         logic_update_flags_en: flags_d[i_fg] = logic_update_flags;
+        mac_update_flags_en:   flags_d[i_fg] = mac_update_flags;
         ispr_update_flags_en:  flags_d[i_fg] = ispr_base_wdata_i[i_fg * FlagsWidth +: FlagsWidth];
         default: ;
       endcase
@@ -139,10 +152,10 @@
 
     assign flags_en[i_fg] = ispr_update_flags_en |
       (adder_update_flags_en & is_operation_flag_group[i_fg]) |
-      (logic_update_flags_en & is_operation_flag_group[i_fg]);
+      (logic_update_flags_en & is_operation_flag_group[i_fg]) |
+      (mac_update_flags_en   & is_operation_flag_group[i_fg]);
   end
 
-  assign selected_flags = flags_q[operation_i.flag_group];
 
   logic [WLEN-1:0]             mod_q;
   logic [WLEN-1:0]             mod_d;
diff --git a/hw/ip/otbn/rtl/otbn_controller.sv b/hw/ip/otbn/rtl/otbn_controller.sv
index e93e3cd..b0c5e16 100644
--- a/hw/ip/otbn/rtl/otbn_controller.sv
+++ b/hw/ip/otbn/rtl/otbn_controller.sv
@@ -384,12 +384,14 @@
   assign alu_bignum_operation_o.shift_amt   = insn_dec_bignum_i.alu_shift_amt;
   assign alu_bignum_operation_o.flag_group  = insn_dec_bignum_i.alu_flag_group;
   assign alu_bignum_operation_o.sel_flag    = insn_dec_bignum_i.alu_sel_flag;
-  assign alu_bignum_operation_o.flag_en     = insn_dec_bignum_i.alu_flag_en;
+  assign alu_bignum_operation_o.alu_flag_en = insn_dec_bignum_i.alu_flag_en;
+  assign alu_bignum_operation_o.mac_flag_en = insn_dec_bignum_i.mac_flag_en;
 
   assign mac_bignum_operation_o.operand_a         = rf_bignum_rd_data_a_i;
   assign mac_bignum_operation_o.operand_b         = rf_bignum_rd_data_b_i;
   assign mac_bignum_operation_o.operand_a_qw_sel  = insn_dec_bignum_i.mac_op_a_qw_sel;
   assign mac_bignum_operation_o.operand_b_qw_sel  = insn_dec_bignum_i.mac_op_b_qw_sel;
+  assign mac_bignum_operation_o.wr_hw_sel_upper   = insn_dec_bignum_i.mac_wr_hw_sel_upper;
   assign mac_bignum_operation_o.pre_acc_shift_imm = insn_dec_bignum_i.mac_pre_acc_shift;
   assign mac_bignum_operation_o.zero_acc          = insn_dec_bignum_i.mac_zero_acc;
   assign mac_bignum_operation_o.shift_acc         = insn_dec_bignum_i.mac_shift_out;
@@ -407,8 +409,8 @@
     if (insn_valid_i && insn_dec_bignum_i.rf_we) begin
       if (insn_dec_bignum_i.mac_en && insn_dec_bignum_i.mac_shift_out) begin
         // Special handling for BN.MULQACC.SO, only enable upper or lower half depending on
-        // mac_wr_hw_sel.
-        rf_bignum_wr_en_o = insn_dec_bignum_i.mac_wr_hw_sel ? 2'b10 : 2'b01;
+        // mac_wr_hw_sel_upper.
+        rf_bignum_wr_en_o = insn_dec_bignum_i.mac_wr_hw_sel_upper ? 2'b10 : 2'b01;
       end else if (insn_dec_shared_i.ld_insn) begin
         // Special handling for BN.LID. Load data is requested in the first cycle of the instruction
         // (where state_q == OtbnStateRun) and is available in the second cycle following the
@@ -428,15 +430,15 @@
                                                                  insn_dec_bignum_i.d;
 
   // For the shift-out variant of BN.MULQACC the bottom half of the MAC result is written to one
-  // half of a desintation register specified by the instruction (mac_wr_hw_sel). The bottom half of
+  // half of a destination register specified by the instruction (mac_wr_hw_sel_upper). The bottom half of
   // the MAC result must be placed in the appropriate half of the write data (the RF only accepts
   // write data for the top half in the top half of the write data input). Otherwise (shift-out to
   // bottom half and all other BN.MULQACC instructions) simply pass the MAC result through unchanged
   // as write data.
   assign mac_bignum_rf_wr_data[WLEN-1:WLEN/2] =
-    insn_dec_bignum_i.mac_wr_hw_sel &&
-    insn_dec_bignum_i.mac_shift_out    ? mac_bignum_operation_result_i[WLEN/2-1:0] :
-                                         mac_bignum_operation_result_i[WLEN-1:WLEN/2];
+    insn_dec_bignum_i.mac_wr_hw_sel_upper &&
+    insn_dec_bignum_i.mac_shift_out         ? mac_bignum_operation_result_i[WLEN/2-1:0] :
+                                              mac_bignum_operation_result_i[WLEN-1:WLEN/2];
 
   assign mac_bignum_rf_wr_data[WLEN/2-1:0] = mac_bignum_operation_result_i[WLEN/2-1:0];
 
diff --git a/hw/ip/otbn/rtl/otbn_core.sv b/hw/ip/otbn/rtl/otbn_core.sv
index 2542c83..ff75ef7 100644
--- a/hw/ip/otbn/rtl/otbn_core.sv
+++ b/hw/ip/otbn/rtl/otbn_core.sv
@@ -117,6 +117,8 @@
 
   mac_bignum_operation_t mac_bignum_operation;
   logic [WLEN-1:0]       mac_bignum_operation_result;
+  flags_t                mac_bignum_operation_flags;
+  flags_t                mac_bignum_operation_flags_en;
   logic                  mac_bignum_en;
 
   ispr_e                       ispr_addr;
@@ -356,29 +358,34 @@
     .clk_i,
     .rst_ni,
 
-    .operation_i         (alu_bignum_operation),
-    .operation_result_o  (alu_bignum_operation_result),
+    .operation_i              (alu_bignum_operation),
+    .operation_result_o       (alu_bignum_operation_result),
 
-    .ispr_addr_i         (ispr_addr),
-    .ispr_base_wdata_i   (ispr_base_wdata),
-    .ispr_base_wr_en_i   (ispr_base_wr_en),
-    .ispr_bignum_wdata_i (ispr_bignum_wdata),
-    .ispr_bignum_wr_en_i (ispr_bignum_wr_en),
-    .ispr_rdata_o        (ispr_rdata),
+    .ispr_addr_i              (ispr_addr),
+    .ispr_base_wdata_i        (ispr_base_wdata),
+    .ispr_base_wr_en_i        (ispr_base_wr_en),
+    .ispr_bignum_wdata_i      (ispr_bignum_wdata),
+    .ispr_bignum_wr_en_i      (ispr_bignum_wr_en),
+    .ispr_rdata_o             (ispr_rdata),
 
-    .ispr_acc_i          (ispr_acc),
-    .ispr_acc_wr_data_o  (ispr_acc_wr_data),
-    .ispr_acc_wr_en_o    (ispr_acc_wr_en),
+    .ispr_acc_i               (ispr_acc),
+    .ispr_acc_wr_data_o       (ispr_acc_wr_data),
+    .ispr_acc_wr_en_o         (ispr_acc_wr_en),
 
-    .rnd_i               (rnd)
+    .mac_operation_flags_i    (mac_bignum_operation_flags),
+    .mac_operation_flags_en_i (mac_bignum_operation_flags_en),
+
+    .rnd_i                    (rnd)
   );
 
   otbn_mac_bignum u_otbn_mac_bignum (
     .clk_i,
     .rst_ni,
 
-    .operation_i        (mac_bignum_operation),
-    .operation_result_o (mac_bignum_operation_result),
+    .operation_i          (mac_bignum_operation),
+    .operation_result_o   (mac_bignum_operation_result),
+    .operation_flags_o    (mac_bignum_operation_flags),
+    .operation_flags_en_o (mac_bignum_operation_flags_en),
 
     .mac_en_i           (mac_bignum_en),
 
diff --git a/hw/ip/otbn/rtl/otbn_decoder.sv b/hw/ip/otbn/rtl/otbn_decoder.sv
index 55d042b..ed3a336 100644
--- a/hw/ip/otbn/rtl/otbn_decoder.sv
+++ b/hw/ip/otbn/rtl/otbn_decoder.sv
@@ -79,7 +79,7 @@
 
   logic [1:0] mac_op_a_qw_sel_bignum;
   logic [1:0] mac_op_b_qw_sel_bignum;
-  logic       mac_wr_hw_sel_bignum;
+  logic       mac_wr_hw_sel_upper_bignum;
   logic [1:0] mac_pre_acc_shift_bignum;
   logic       mac_zero_acc_bignum;
   logic       mac_shift_out_bignum;
@@ -131,6 +131,7 @@
   assign alu_sel_flag_bignum = flag_e'(insn[26:25]);
 
   logic alu_flag_en_bignum;
+  logic mac_flag_en_bignum;
 
   // source registers
   assign insn_rs1 = insn[19:15];
@@ -149,12 +150,12 @@
   assign loop_bodysize_base  = insn[31:20];
   assign loop_immediate_base = insn[12];
 
-  assign mac_op_a_qw_sel_bignum   = insn[26:25];
-  assign mac_op_b_qw_sel_bignum   = insn[28:27];
-  assign mac_wr_hw_sel_bignum     = insn[29];
-  assign mac_pre_acc_shift_bignum = insn[14:13];
-  assign mac_zero_acc_bignum      = insn[12];
-  assign mac_shift_out_bignum     = insn[31];
+  assign mac_op_a_qw_sel_bignum     = insn[26:25];
+  assign mac_op_b_qw_sel_bignum     = insn[28:27];
+  assign mac_wr_hw_sel_upper_bignum = insn[29];
+  assign mac_pre_acc_shift_bignum   = insn[14:13];
+  assign mac_zero_acc_bignum        = insn[12];
+  assign mac_shift_out_bignum       = insn[30];
 
   logic d_inc_bignum;
   logic a_inc_bignum;
@@ -216,35 +217,36 @@
   };
 
   assign insn_dec_bignum_o = '{
-    a:                 insn_rs1,
-    b:                 insn_rs2,
-    d:                 insn_rd,
-    i:                 imm_i_type_bignum,
-    rf_a_indirect:     rf_a_indirect_bignum,
-    rf_b_indirect:     rf_b_indirect_bignum,
-    rf_d_indirect:     rf_d_indirect_bignum,
-    d_inc:             d_inc_bignum,
-    a_inc:             a_inc_bignum,
-    a_wlen_word_inc:   a_wlen_word_inc_bignum,
-    b_inc:             b_inc_bignum,
-    alu_shift_amt:     alu_shift_amt_bignum,
-    alu_shift_right:   alu_shift_right_bignum,
-    alu_flag_group:    alu_flag_group_bignum,
-    alu_sel_flag:      alu_sel_flag_bignum,
-    alu_flag_en:       alu_flag_en_bignum,
-    alu_op:            alu_operator_bignum,
-    alu_op_b_sel:      alu_op_b_mux_sel_bignum,
-    mac_op_a_qw_sel:   mac_op_a_qw_sel_bignum,
-    mac_op_b_qw_sel:   mac_op_b_qw_sel_bignum,
-    mac_wr_hw_sel:     mac_wr_hw_sel_bignum,
-    mac_pre_acc_shift: mac_pre_acc_shift_bignum,
-    mac_zero_acc:      mac_zero_acc_bignum,
-    mac_shift_out:     mac_shift_out_bignum,
-    mac_en:            mac_en_bignum,
-    rf_we:             rf_we_bignum,
-    rf_wdata_sel:      rf_wdata_sel_bignum,
-    rf_ren_a:          rf_ren_a_bignum,
-    rf_ren_b:          rf_ren_b_bignum
+    a:                   insn_rs1,
+    b:                   insn_rs2,
+    d:                   insn_rd,
+    i:                   imm_i_type_bignum,
+    rf_a_indirect:       rf_a_indirect_bignum,
+    rf_b_indirect:       rf_b_indirect_bignum,
+    rf_d_indirect:       rf_d_indirect_bignum,
+    d_inc:               d_inc_bignum,
+    a_inc:               a_inc_bignum,
+    a_wlen_word_inc:     a_wlen_word_inc_bignum,
+    b_inc:               b_inc_bignum,
+    alu_shift_amt:       alu_shift_amt_bignum,
+    alu_shift_right:     alu_shift_right_bignum,
+    alu_flag_group:      alu_flag_group_bignum,
+    alu_sel_flag:        alu_sel_flag_bignum,
+    alu_flag_en:         alu_flag_en_bignum,
+    mac_flag_en:         mac_flag_en_bignum,
+    alu_op:              alu_operator_bignum,
+    alu_op_b_sel:        alu_op_b_mux_sel_bignum,
+    mac_op_a_qw_sel:     mac_op_a_qw_sel_bignum,
+    mac_op_b_qw_sel:     mac_op_b_qw_sel_bignum,
+    mac_wr_hw_sel_upper: mac_wr_hw_sel_upper_bignum,
+    mac_pre_acc_shift:   mac_pre_acc_shift_bignum,
+    mac_zero_acc:        mac_zero_acc_bignum,
+    mac_shift_out:       mac_shift_out_bignum,
+    mac_en:              mac_en_bignum,
+    rf_we:               rf_we_bignum,
+    rf_wdata_sel:        rf_wdata_sel_bignum,
+    rf_ren_a:            rf_ren_a_bignum,
+    rf_ren_b:            rf_ren_b_bignum
   };
 
   assign insn_dec_shared_o = '{
@@ -633,7 +635,7 @@
         rf_wdata_sel_bignum = RfWdSelMac;
         mac_en_bignum       = 1'b1;
 
-        if (insn[31:30] != 2'b00) begin // BN.MULQACC.WO/BN.MULQACC.SO
+        if (insn[30] == 1'b1 || insn[29] == 1'b1) begin // BN.MULQACC.WO/BN.MULQACC.SO
           rf_we_bignum = 1'b1;
         end
       end
@@ -672,6 +674,7 @@
     opcode_alu               = insn_opcode_e'(insn_alu[6:0]);
 
     alu_flag_en_bignum       = 1'b0;
+    mac_flag_en_bignum       = 1'b0;
 
     unique case (opcode_alu)
       //////////////
@@ -907,6 +910,16 @@
           default: ;
         endcase
       end
+
+      ////////////////////////////////////////////
+      // BN.MULQACC/BN.MULQACC.WO/BN.MULQACC.SO //
+      ////////////////////////////////////////////
+
+      InsnOpcodeBignumMulqacc: begin
+        if (insn[30] == 1'b1 || insn[29] == 1'b1) begin // BN.MULQACC.WO/BN.MULQACC.SO
+          mac_flag_en_bignum = 1'b1;
+        end
+      end
     endcase
 
   end
diff --git a/hw/ip/otbn/rtl/otbn_mac_bignum.sv b/hw/ip/otbn/rtl/otbn_mac_bignum.sv
index 4de8e1b..ed667c0 100644
--- a/hw/ip/otbn/rtl/otbn_mac_bignum.sv
+++ b/hw/ip/otbn/rtl/otbn_mac_bignum.sv
@@ -14,6 +14,8 @@
   input logic                  mac_en_i,
 
   output logic [WLEN-1:0] operation_result_o,
+  output flags_t          operation_flags_o,
+  output flags_t          operation_flags_en_o,
 
   output logic [WLEN-1:0] ispr_acc_o,
   input  logic [WLEN-1:0] ispr_acc_wr_data_i,
@@ -25,6 +27,7 @@
   logic [WLEN-1:0] adder_op_a;
   logic [WLEN-1:0] adder_op_b;
   logic [WLEN-1:0] adder_result;
+  logic [1:0]      adder_result_hw_is_zero;
 
   logic [QWLEN-1:0]  mul_op_a;
   logic [QWLEN-1:0]  mul_op_b;
@@ -91,6 +94,39 @@
 
   assign adder_result = adder_op_a + adder_op_b;
 
+  // Split zero check between the two halves of the result. This is used for flag setting (see
+  // below).
+  assign adder_result_hw_is_zero[0] = adder_result[WLEN/2-1:0] == 'h0;
+  assign adder_result_hw_is_zero[1] = adder_result[WLEN/2+:WLEN/2] == 'h0;
+
+  assign operation_flags_o.L    = adder_result[0];
+  // L is always updated for .WO, and for .SO when writing to the lower half-word
+  assign operation_flags_en_o.L = operation_i.shift_acc ? ~operation_i.wr_hw_sel_upper :
+                                                          1'b1;
+
+  // For .SO M is taken from the top-bit of shifted out half-word, otherwise it is taken from the top-bit
+  // of the full result.
+  assign operation_flags_o.M    = operation_i.shift_acc ? adder_result[WLEN/2-1] :
+                                                          adder_result[WLEN-1];
+  // M is always updated for .WO, and for .SO when writing to the upper half-word.
+  assign operation_flags_en_o.M = operation_i.shift_acc ? operation_i.wr_hw_sel_upper :
+                                                          1'b1;
+
+  // For .SO Z is calculated from the shifted out half-word, otherwise it is calculated on the full result.
+  assign operation_flags_o.Z    = operation_i.shift_acc ? adder_result_hw_is_zero[0] :
+                                                          &adder_result_hw_is_zero;
+
+  // Z is updated for .WO. For .SO updates are based upon result and half-word:
+  // - When writing to lower half-word always update Z.
+  // - When writing to upper half-word clear Z if result is non-zero otherwise leave it alone.
+  assign operation_flags_en_o.Z =
+      operation_i.shift_acc & operation_i.wr_hw_sel_upper ? ~adder_result_hw_is_zero[0] :
+                                                            1'b1;
+
+  // MAC never sets the carry flag
+  assign operation_flags_o.C    = 1'b0;
+  assign operation_flags_en_o.C = 1'b0;
+
   // If performing an ACC ISPR write the next accumulator value is taken from the ISPR write data,
   // otherwise it is drawn from the adder result. The new accumulator can be optionally shifted
   // right by one half-word (shift_acc).
diff --git a/hw/ip/otbn/rtl/otbn_pkg.sv b/hw/ip/otbn/rtl/otbn_pkg.sv
index cb9713d..03d0123 100644
--- a/hw/ip/otbn/rtl/otbn_pkg.sv
+++ b/hw/ip/otbn/rtl/otbn_pkg.sv
@@ -282,12 +282,13 @@
     flag_group_t             alu_flag_group;
     flag_e                   alu_sel_flag;
     logic                    alu_flag_en;
+    logic                    mac_flag_en;
     alu_op_bignum_e          alu_op;
     op_b_sel_e               alu_op_b_sel;
 
     logic [1:0]              mac_op_a_qw_sel;
     logic [1:0]              mac_op_b_qw_sel;
-    logic                    mac_wr_hw_sel;
+    logic                    mac_wr_hw_sel_upper;
     logic [1:0]              mac_pre_acc_shift;
     logic                    mac_zero_acc;
     logic                    mac_shift_out;
@@ -319,7 +320,8 @@
     logic [$clog2(WLEN)-1:0] shift_amt;
     flag_group_t             flag_group;
     flag_e                   sel_flag;
-    logic                    flag_en;
+    logic                    alu_flag_en;
+    logic                    mac_flag_en;
   } alu_bignum_operation_t;
 
   typedef struct packed {
@@ -327,6 +329,7 @@
     logic [WLEN-1:0] operand_b;
     logic [1:0]      operand_a_qw_sel;
     logic [1:0]      operand_b_qw_sel;
+    logic            wr_hw_sel_upper;
     logic [1:0]      pre_acc_shift_imm;
     logic            zero_acc;
     logic            shift_acc;