[aes] Switch to multi-rail cipher core FSM

This commit switches the cipher core FSM to a multi-rail implementation
to protect the design against control-path FI. Some relevant control
signals use multi-bit encodings and every rail evaluates and drives one
bit of these signals only. To prevent synthesis optimizations and
resource sharing between individual rails, prim_buf cells are used as
optimization barriers.

Signed-off-by: Pirmin Vogel <vogelpi@lowrisc.org>
diff --git a/hw/ip/aes/aes.core b/hw/ip/aes/aes.core
index 3eb4a75..9d928cd 100644
--- a/hw/ip/aes/aes.core
+++ b/hw/ip/aes/aes.core
@@ -27,6 +27,9 @@
       - rtl/aes_sel_buf_chk.sv
       - rtl/aes_cipher_core.sv
       - rtl/aes_cipher_control.sv
+      - rtl/aes_cipher_control_fsm.sv
+      - rtl/aes_cipher_control_fsm_p.sv
+      - rtl/aes_cipher_control_fsm_n.sv
       - rtl/aes_sub_bytes.sv
       - rtl/aes_sbox.sv
       - rtl/aes_sbox_lut.sv
diff --git a/hw/ip/aes/lint/aes.vlt b/hw/ip/aes/lint/aes.vlt
index fa50924..6125013 100644
--- a/hw/ip/aes/lint/aes.vlt
+++ b/hw/ip/aes/lint/aes.vlt
@@ -40,4 +40,4 @@
 lint_off -rule UNOPTFLAT -file "*/rtl/aes_control.sv" -match "Signal unoptimizable: Feedback to clock or circular logic: '*u_aes_control.sp2v_sig[59:48]'"
 lint_off -rule UNOPTFLAT -file "*/rtl/aes_control.sv" -match "Signal unoptimizable: Feedback to clock or circular logic: '*u_aes_control.sp2v_sig[23:12]'"
 lint_off -rule UNOPTFLAT -file "*/rtl/aes_control.sv" -match "Signal unoptimizable: Feedback to clock or circular logic: '*u_aes_control.sp2v_sig[11:0]'"
-lint_off -rule UNOPTFLAT -file "*/rtl/aes_cipher_control.sv" -match "Signal unoptimizable: Feedback to clock or circular logic: '*u_aes_cipher_control.sp2v_sig[23:12]'"
+lint_off -rule UNOPTFLAT -file "*/rtl/aes_cipher_control_fsm_*.sv" -match "Signal unoptimizable: Feedback to clock or circular logic: '*u_aes_cipher_control_fsm_i.out'"
diff --git a/hw/ip/aes/rtl/aes_cipher_control.sv b/hw/ip/aes/rtl/aes_cipher_control.sv
index 3e0663a..30b3bf7 100644
--- a/hw/ip/aes/rtl/aes_cipher_control.sv
+++ b/hw/ip/aes/rtl/aes_cipher_control.sv
@@ -68,390 +68,250 @@
   output round_key_sel_e          round_key_sel_o
 );
 
-  // Types
-  // $ ./sparse-fsm-encode.py -d 3 -m 7 -n 6 \
-  //      -s 31468618 --language=sv
-  //
-  // Hamming distance histogram:
-  //
-  //  0: --
-  //  1: --
-  //  2: --
-  //  3: |||||||||||||||||||| (57.14%)
-  //  4: ||||||||||||||| (42.86%)
-  //  5: --
-  //  6: --
-  //
-  // Minimum Hamming distance: 3
-  // Maximum Hamming distance: 4
-  //
-  localparam int StateWidth = 6;
-  typedef enum logic [StateWidth-1:0] {
-    IDLE     = 6'b111100,
-    INIT     = 6'b101001,
-    ROUND    = 6'b010000,
-    FINISH   = 6'b100010,
-    CLEAR_S  = 6'b011011,
-    CLEAR_KD = 6'b110111,
-    ERROR    = 6'b001110
-  } aes_cipher_ctrl_e;
-
-  aes_cipher_ctrl_e aes_cipher_ctrl_ns, aes_cipher_ctrl_cs;
-
   // Signals
-  logic [3:0] rnd_ctr_d, rnd_ctr_q;
-  logic [3:0] rnd_ctr_rem_d, rnd_ctr_rem_q;
-  logic [3:0] rnd_ctr_sum;
-  logic [3:0] num_rounds_d, num_rounds_q;
-  logic [3:0] num_rounds_regular;
-  logic       rnd_ctr_parity, rnd_ctr_parity_d, rnd_ctr_parity_q;
-  logic       rnd_ctr_err, rnd_ctr_err_sum, rnd_ctr_err_parity;
-  sp2v_e      crypt_d, crypt_q;
-  sp2v_e      dec_key_gen_d, dec_key_gen_q;
-  logic       key_clear_d, key_clear_q;
-  logic       data_out_clear_d, data_out_clear_q;
-  logic       prng_reseed_done_d, prng_reseed_done_q;
-  sp2v_e      sub_bytes_out_req;
-  sp2v_e      key_expand_out_req;
-  sp2v_e      advance, advance_chk;
-  sp2v_e      in_valid;
-  sp2v_e      out_ready;
-  sp2v_e      crypt;
-  sp2v_e      dec_key_gen;
-  logic       sp_enc_err;
+  logic                          [3:0] rnd_ctr_d, rnd_ctr_q;
+  logic                          [3:0] rnd_ctr_rem_d, rnd_ctr_rem_q;
+  logic                          [3:0] rnd_ctr_sum;
+  logic                          [3:0] num_rounds_d, num_rounds_q;
+  logic                                rnd_ctr_parity, rnd_ctr_parity_d, rnd_ctr_parity_q;
+  logic                                rnd_ctr_err, rnd_ctr_err_sum, rnd_ctr_err_parity;
+  sp2v_e                               crypt_d, crypt_q;
+  sp2v_e                               dec_key_gen_d, dec_key_gen_q;
+  logic                                key_clear_d, key_clear_q;
+  logic                                data_out_clear_d, data_out_clear_q;
+  sp2v_e                               sub_bytes_out_req;
+  sp2v_e                               key_expand_out_req;
+  sp2v_e                               in_valid;
+  sp2v_e                               out_ready;
+  sp2v_e                               crypt;
+  sp2v_e                               dec_key_gen;
+  logic                                mux_sel_err;
+  logic                                mr_err;
+  logic                                sp_enc_err;
 
-  // cfg_valid_i is used for gating assertions only.
-  logic       unused_cfg_valid;
-  assign unused_cfg_valid = cfg_valid_i;
+  // Multi-rail signals. These are outputs of the single-rail FSMs and need combining.
+  logic           [Sp2VWidth-1:0]      mr_alert;
+  logic           [Sp2VWidth-1:0]      mr_prng_update;
+  logic           [Sp2VWidth-1:0]      mr_prng_reseed_req;
+  logic           [Sp2VWidth-1:0]      mr_key_expand_clear;
+  logic           [Sp2VWidth-1:0]      mr_key_clear_d;
+  logic           [Sp2VWidth-1:0]      mr_data_out_clear_d;
 
-  // FSM
-  always_comb begin : aes_cipher_ctrl_fsm
+  state_sel_e     [Sp2VWidth-1:0]      mr_state_sel;
+  add_rk_sel_e    [Sp2VWidth-1:0]      mr_add_rk_sel;
+  key_full_sel_e  [Sp2VWidth-1:0]      mr_key_full_sel;
+  key_dec_sel_e   [Sp2VWidth-1:0]      mr_key_dec_sel;
+  key_words_sel_e [Sp2VWidth-1:0]      mr_key_words_sel;
+  round_key_sel_e [Sp2VWidth-1:0]      mr_round_key_sel;
 
-    // Handshake signals
-    in_ready_o           = SP2V_LOW;
-    out_valid_o          = SP2V_LOW;
+  logic           [Sp2VWidth-1:0][3:0] mr_rnd_ctr_d;
+  logic           [Sp2VWidth-1:0][3:0] mr_rnd_ctr_rem_d;
+  logic           [Sp2VWidth-1:0][3:0] mr_num_rounds_d;
 
-    // Masking PRNG signals
-    prng_update_o        = 1'b0;
-    prng_reseed_req_o    = 1'b0;
+  /////////
+  // FSM //
+  /////////
+  // For every bit in the Sp2V signals, one separate rail is instantiated. The inputs and outputs
+  // of every rail are buffered to prevent aggressive synthesis optimizations.
+  for (genvar i = 0; i < Sp2VWidth; i++) begin : gen_fsm
+    if (SP2V_HIGH[i] == 1'b1) begin : gen_fsm_p
+      aes_cipher_control_fsm_p #(
+        .Masking  ( Masking  ),
+        .SBoxImpl ( SBoxImpl )
+      ) u_aes_cipher_control_fsm_i (
+        .clk_i                 ( clk_i                    ),
+        .rst_ni                ( rst_ni                   ),
 
-    // Cipher data path
-    state_sel_o          = STATE_ROUND;
-    state_we_o           = SP2V_LOW;
-    add_rk_sel_o         = ADD_RK_ROUND;
-    sub_bytes_en_o       = SP2V_LOW;
-    sub_bytes_out_ack_o  = SP2V_LOW;
+        .in_valid_i            ( in_valid[i]              ), // Sparsified
+        .in_ready_o            ( in_ready_o[i]            ), // Sparsified
 
-    // Key expand data path
-    key_full_sel_o       = KEY_FULL_ROUND;
-    key_full_we_o        = SP2V_LOW;
-    key_dec_sel_o        = KEY_DEC_EXPAND;
-    key_dec_we_o         = SP2V_LOW;
-    key_expand_en_o      = SP2V_LOW;
-    key_expand_out_ack_o = SP2V_LOW;
-    key_expand_clear_o   = 1'b0;
-    key_words_sel_o      = KEY_WORDS_ZERO;
-    round_key_sel_o      = ROUND_KEY_DIRECT;
+        .out_valid_o           ( out_valid_o[i]           ), // Sparsified
+        .out_ready_i           ( out_ready[i]             ), // Sparsified
 
-    // FSM
-    aes_cipher_ctrl_ns   = aes_cipher_ctrl_cs;
-    num_rounds_d         = num_rounds_q;
-    rnd_ctr_d            = rnd_ctr_q;
-    rnd_ctr_rem_d        = rnd_ctr_rem_q;
-    crypt_d              = crypt_q;
-    dec_key_gen_d        = dec_key_gen_q;
-    key_clear_d          = key_clear_q;
-    data_out_clear_d     = data_out_clear_q;
-    prng_reseed_done_d   = prng_reseed_done_q | prng_reseed_ack_i;
-    advance              = SP2V_LOW;
+        .cfg_valid_i           ( cfg_valid_i              ),
+        .op_i                  ( op_i                     ),
+        .key_len_i             ( key_len_i                ),
+        .crypt_i               ( crypt[i]                 ), // Sparsified
+        .dec_key_gen_i         ( dec_key_gen[i]           ), // Sparsified
+        .key_clear_i           ( key_clear_i              ),
+        .data_out_clear_i      ( data_out_clear_i         ),
+        .mux_sel_err_i         ( mux_sel_err              ),
+        .sp_enc_err_i          ( sp_enc_err               ),
+        .rnd_ctr_err_i         ( rnd_ctr_err              ),
+        .alert_o               ( mr_alert[i]              ), // OR-combine
 
-    // Alert
-    alert_o              = 1'b0;
+        .prng_update_o         ( mr_prng_update[i]        ), // OR-combine
+        .prng_reseed_req_o     ( mr_prng_reseed_req[i]    ), // OR-combine
+        .prng_reseed_ack_i     ( prng_reseed_ack_i        ),
 
-    unique case (aes_cipher_ctrl_cs)
+        .state_sel_o           ( mr_state_sel[i]          ), // OR-combine
+        .state_we_o            ( state_we_o[i]            ), // Sparsified
+        .sub_bytes_en_o        ( sub_bytes_en_o[i]        ), // Sparsified
+        .sub_bytes_out_req_i   ( sub_bytes_out_req[i]     ), // Sparsified
+        .sub_bytes_out_ack_o   ( sub_bytes_out_ack_o[i]   ), // Sparsified
+        .add_rk_sel_o          ( mr_add_rk_sel[i]         ), // OR-combine
 
-      IDLE: begin
-        dec_key_gen_d = SP2V_LOW;
+        .key_full_sel_o        ( mr_key_full_sel[i]       ), // OR-combine
+        .key_full_we_o         ( key_full_we_o[i]         ), // Sparsified
+        .key_dec_sel_o         ( mr_key_dec_sel[i]        ), // OR-combine
+        .key_dec_we_o          ( key_dec_we_o[i]          ), // Sparsified
+        .key_expand_en_o       ( key_expand_en_o[i]       ), // Sparsified
+        .key_expand_out_req_i  ( key_expand_out_req[i]    ), // Sparsified
+        .key_expand_out_ack_o  ( key_expand_out_ack_o[i]  ), // Sparsified
+        .key_expand_clear_o    ( mr_key_expand_clear[i]   ), // OR-combine
+        .key_words_sel_o       ( mr_key_words_sel[i]      ), // OR-combine
+        .round_key_sel_o       ( mr_round_key_sel[i]      ), // OR-combine
 
-        // Signal that we are ready, wait for handshake.
-        in_ready_o = SP2V_HIGH;
-        if (in_valid == SP2V_HIGH) begin
-          if (key_clear_i || data_out_clear_i) begin
-            // Clear internal key registers. The cipher core muxes are used to clear the data
-            // output registers.
-            key_clear_d      = key_clear_i;
-            data_out_clear_d = data_out_clear_i;
+        .rnd_ctr_q_i           ( rnd_ctr_q                ),
+        .rnd_ctr_d_o           ( mr_rnd_ctr_d[i]          ), // OR-combine
+        .rnd_ctr_rem_q_i       ( rnd_ctr_rem_q            ),
+        .rnd_ctr_rem_d_o       ( mr_rnd_ctr_rem_d[i]      ), // OR-combine
+        .num_rounds_q_i        ( num_rounds_q             ),
+        .num_rounds_d_o        ( mr_num_rounds_d[i]       ), // OR-combine
+        .crypt_q_i             ( crypt_q[i]               ), // Sparsified
+        .crypt_d_o             ( crypt_d[i]               ), // Sparsified
+        .dec_key_gen_q_i       ( dec_key_gen_q[i]         ), // Sparsified
+        .dec_key_gen_d_o       ( dec_key_gen_d[i]         ), // Sparsified
+        .key_clear_q_i         ( key_clear_q              ),
+        .key_clear_d_o         ( mr_key_clear_d[i]        ), // AND-combine
+        .data_out_clear_q_i    ( data_out_clear_q         ),
+        .data_out_clear_d_o    ( mr_data_out_clear_d[i]   )  // AND-combine
+      );
+    end else begin : gen_fsm_n
+      aes_cipher_control_fsm_n #(
+        .Masking  ( Masking  ),
+        .SBoxImpl ( SBoxImpl )
+      ) u_aes_cipher_control_fsm_i (
+        .clk_i                 ( clk_i                    ),
+        .rst_ni                ( rst_ni                   ),
 
-            // To clear the data output registers, we must first clear the state.
-            aes_cipher_ctrl_ns = data_out_clear_i ? CLEAR_S : CLEAR_KD;
+        .in_valid_ni           ( in_valid[i]              ), // Sparsified
+        .in_ready_no           ( in_ready_o[i]            ), // Sparsified
 
-          end else if (dec_key_gen == SP2V_HIGH || crypt == SP2V_HIGH) begin
-            // Start encryption/decryption or generation of start key for decryption.
-            crypt_d       = (dec_key_gen_i == SP2V_LOW) ? crypt : SP2V_LOW;
-            dec_key_gen_d =  dec_key_gen_i;
+        .out_valid_no          ( out_valid_o[i]           ), // Sparsified
+        .out_ready_ni          ( out_ready[i]             ), // Sparsified
 
-            // Load input data to state
-            state_sel_o = (dec_key_gen_i == SP2V_HIGH) ? STATE_CLEAR : STATE_INIT;
-            state_we_o  = SP2V_HIGH;
+        .cfg_valid_i           ( cfg_valid_i              ),
+        .op_i                  ( op_i                     ),
+        .key_len_i             ( key_len_i                ),
+        .crypt_ni              ( crypt[i]                 ), // Sparsified
+        .dec_key_gen_ni        ( dec_key_gen[i]           ), // Sparsified
+        .key_clear_i           ( key_clear_i              ),
+        .data_out_clear_i      ( data_out_clear_i         ),
+        .mux_sel_err_i         ( mux_sel_err              ), // Incl. mr_err, as for the p-rails
+        .sp_enc_err_i          ( sp_enc_err               ),
+        .rnd_ctr_err_i         ( rnd_ctr_err              ),
+        .alert_o               ( mr_alert[i]              ), // OR-combine
 
-            // Make the masking PRNG advance. The current pseudo-random data is used to mask the
-            // input data.
-            prng_update_o = (dec_key_gen_i == SP2V_HIGH) ? 1'b0 : Masking;
+        .prng_update_o         ( mr_prng_update[i]        ), // OR-combine
+        .prng_reseed_req_o     ( mr_prng_reseed_req[i]    ), // OR-combine
+        .prng_reseed_ack_i     ( prng_reseed_ack_i        ),
 
-            // Init key expand
-            key_expand_clear_o = 1'b1;
+        .state_sel_o           ( mr_state_sel[i]          ), // OR-combine
+        .state_we_no           ( state_we_o[i]            ), // Sparsified
+        .sub_bytes_en_no       ( sub_bytes_en_o[i]        ), // Sparsified
+        .sub_bytes_out_req_ni  ( sub_bytes_out_req[i]     ), // Sparsified
+        .sub_bytes_out_ack_no  ( sub_bytes_out_ack_o[i]   ), // Sparsified
+        .add_rk_sel_o          ( mr_add_rk_sel[i]         ), // OR-combine
 
-            // Load full key
-            key_full_sel_o = (dec_key_gen_i == SP2V_HIGH) ? KEY_FULL_ENC_INIT :
-                                       (op_i == CIPH_FWD) ? KEY_FULL_ENC_INIT :
-                                                            KEY_FULL_DEC_INIT;
-            key_full_we_o  = SP2V_HIGH;
+        .key_full_sel_o        ( mr_key_full_sel[i]       ), // OR-combine
+        .key_full_we_no        ( key_full_we_o[i]         ), // Sparsified
+        .key_dec_sel_o         ( mr_key_dec_sel[i]        ), // OR-combine
+        .key_dec_we_no         ( key_dec_we_o[i]          ), // Sparsified
+        .key_expand_en_no      ( key_expand_en_o[i]       ), // Sparsified
+        .key_expand_out_req_ni ( key_expand_out_req[i]    ), // Sparsified
+        .key_expand_out_ack_no ( key_expand_out_ack_o[i]  ), // Sparsified
+        .key_expand_clear_o    ( mr_key_expand_clear[i]   ), // OR-combine
+        .key_words_sel_o       ( mr_key_words_sel[i]      ), // OR-combine
+        .round_key_sel_o       ( mr_round_key_sel[i]      ), // OR-combine
 
-            // Load num_rounds, initialize round counters.
-            num_rounds_d = (key_len_i == AES_128) ? 4'd10 :
-                           (key_len_i == AES_192) ? 4'd12 :
-                                                    4'd14;
-            rnd_ctr_rem_d      = num_rounds_d;
-            rnd_ctr_d          = '0;
-            aes_cipher_ctrl_ns = INIT;
-
-          end else begin
-            // Handshake without a valid command. We should never get here. If we do (e.g. via a
-            // malicious glitch), error out immediately.
-            aes_cipher_ctrl_ns = ERROR;
-          end
-        end
-      end
-
-      INIT: begin
-        // Initial round: just add key to state
-        add_rk_sel_o = ADD_RK_INIT;
-
-        // Select key words for initial add_round_key
-        key_words_sel_o = (dec_key_gen_q == SP2V_HIGH) ? KEY_WORDS_ZERO :
-            (key_len_i == AES_128)                     ? KEY_WORDS_0123 :
-            (key_len_i == AES_192 && op_i == CIPH_FWD) ? KEY_WORDS_0123 :
-            (key_len_i == AES_192 && op_i == CIPH_INV) ? KEY_WORDS_2345 :
-            (key_len_i == AES_256 && op_i == CIPH_FWD) ? KEY_WORDS_0123 :
-            (key_len_i == AES_256 && op_i == CIPH_INV) ? KEY_WORDS_4567 : KEY_WORDS_ZERO;
-
-        // Clear masking PRNG reseed status.
-        prng_reseed_done_d = 1'b0;
-
-        // AES-256 has two round keys available right from beginning. Pseudo-random data is
-        // required by KeyExpand only.
-        if (key_len_i != AES_256) begin
-          // Advance in sync with KeyExpand. Based on the S-Box implementation, it can take
-          // multiple cycles to finish. Wait for handshake. The DOM S-Boxes take fresh PRD
-          // in every cycle except the last.
-          advance         = key_expand_out_req;
-          prng_update_o   = (SBoxImpl == SBoxImplDom) ? (advance_chk == SP2V_LOW) : Masking;
-          key_expand_en_o = SP2V_HIGH;
-          if (advance_chk == SP2V_HIGH) begin
-            key_expand_out_ack_o = SP2V_HIGH;
-            state_we_o           = (dec_key_gen_q == SP2V_LOW) ? SP2V_HIGH : SP2V_LOW;
-            key_full_we_o        = SP2V_HIGH;
-            rnd_ctr_d            = rnd_ctr_q     + 4'b0001;
-            rnd_ctr_rem_d        = rnd_ctr_rem_q - 4'b0001;
-            aes_cipher_ctrl_ns   = ROUND;
-          end
-        end else begin
-          state_we_o         = (dec_key_gen_q == SP2V_LOW) ? SP2V_HIGH : SP2V_LOW;
-          rnd_ctr_d          = rnd_ctr_q     + 4'b0001;
-          rnd_ctr_rem_d      = rnd_ctr_rem_q - 4'b0001;
-          aes_cipher_ctrl_ns = ROUND;
-        end
-      end
-
-      ROUND: begin
-        // Normal rounds
-
-        // Select key words for add_round_key
-        key_words_sel_o = (dec_key_gen_q == SP2V_HIGH) ? KEY_WORDS_ZERO :
-            (key_len_i == AES_128)                     ? KEY_WORDS_0123 :
-            (key_len_i == AES_192 && op_i == CIPH_FWD) ? KEY_WORDS_2345 :
-            (key_len_i == AES_192 && op_i == CIPH_INV) ? KEY_WORDS_0123 :
-            (key_len_i == AES_256 && op_i == CIPH_FWD) ? KEY_WORDS_4567 :
-            (key_len_i == AES_256 && op_i == CIPH_INV) ? KEY_WORDS_0123 : KEY_WORDS_ZERO;
-
-        // Keep requesting PRNG reseeding until it is acknowledged.
-        prng_reseed_req_o = Masking & ~prng_reseed_done_q;
-
-        // Select round key: direct or mixed (equivalent inverse cipher)
-        round_key_sel_o = (op_i == CIPH_FWD) ? ROUND_KEY_DIRECT : ROUND_KEY_MIXED;
-
-        // Advance in sync with SubBytes and KeyExpand. Based on the S-Box implementation, both can
-        // take multiple cycles to finish. Wait for handshake. Make the masking PRNG advance every
-        // cycle. The DOM S-Boxes take fresh PRD in every cycle except the last.
-        advance         = ((dec_key_gen_q == SP2V_HIGH || sub_bytes_out_req == SP2V_HIGH) &&
-            key_expand_out_req == SP2V_HIGH) ? SP2V_HIGH : SP2V_LOW;
-        prng_update_o   = (SBoxImpl == SBoxImplDom) ? (advance_chk == SP2V_LOW) : Masking;
-        sub_bytes_en_o  = (dec_key_gen_q == SP2V_LOW) ? SP2V_HIGH : SP2V_LOW;
-        key_expand_en_o = SP2V_HIGH;
-        if (advance_chk == SP2V_HIGH) begin
-          sub_bytes_out_ack_o  = (dec_key_gen_q == SP2V_LOW) ? SP2V_HIGH : SP2V_LOW;
-          key_expand_out_ack_o = SP2V_HIGH;
-
-          state_we_o    = (dec_key_gen_q == SP2V_LOW) ? SP2V_HIGH : SP2V_LOW;
-          key_full_we_o = SP2V_HIGH;
-
-          // Update round
-          rnd_ctr_d     = rnd_ctr_q     + 4'b0001;
-          rnd_ctr_rem_d = rnd_ctr_rem_q - 4'b0001;
-
-          // Are we doing the last regular round?
-          if (rnd_ctr_q == num_rounds_regular) begin
-            aes_cipher_ctrl_ns = FINISH;
-
-            if (dec_key_gen_q == SP2V_HIGH) begin
-              // Write decryption key.
-              key_dec_we_o = SP2V_HIGH;
-
-              // Indicate that we are done, try to perform the handshake. But we don't wait here.
-              // If we don't get the handshake now, we will wait in the finish state. When using
-              // masking, we only finish if the masking PRNG has been reseeded.
-              out_valid_o = Masking ? (prng_reseed_done_q ? SP2V_HIGH : SP2V_LOW) : SP2V_HIGH;
-              if (out_valid_o == SP2V_HIGH && out_ready == SP2V_HIGH) begin
-                // Go to idle state directly.
-                dec_key_gen_d      = SP2V_LOW;
-                aes_cipher_ctrl_ns = IDLE;
-              end
-            end
-          end // rnd_ctr_q
-        end // SubBytes/KeyExpand REQ/ACK
-      end
-
-      FINISH: begin
-        // Final round
-
-        // Select key words for add_round_key
-        key_words_sel_o = (dec_key_gen_q == SP2V_HIGH) ? KEY_WORDS_ZERO :
-            (key_len_i == AES_128)                     ? KEY_WORDS_0123 :
-            (key_len_i == AES_192 && op_i == CIPH_FWD) ? KEY_WORDS_2345 :
-            (key_len_i == AES_192 && op_i == CIPH_INV) ? KEY_WORDS_0123 :
-            (key_len_i == AES_256 && op_i == CIPH_FWD) ? KEY_WORDS_4567 :
-            (key_len_i == AES_256 && op_i == CIPH_INV) ? KEY_WORDS_0123 : KEY_WORDS_ZERO;
-
-        // Skip mix_columns
-        add_rk_sel_o = ADD_RK_FINAL;
-
-        // Keep requesting PRNG reseeding until it is acknowledged.
-        prng_reseed_req_o = Masking & ~prng_reseed_done_q;
-
-        // Once we're done, we won't need the state anymore. We actually clear it when progressing
-        // to the next state.
-        state_sel_o = STATE_CLEAR;
-
-        // Advance in sync with SubBytes. Based on the S-Box implementation, it can take multiple
-        // cycles to finish. Only indicate that we are done if:
-        // - we have valid output (SubBytes finished),
-        // - the masking PRNG has been reseeded (if masking is used),
-        // - all mux selector signals are valid (don't release data in case of errors), and
-        // - all sparsely encoded signals are valid (don't release data in case of errors).
-        // Perform both handshakes simultaneously.
-        advance = (sub_bytes_out_req == SP2V_HIGH ||
-                          dec_key_gen_q == SP2V_HIGH) ? SP2V_HIGH : SP2V_LOW;
-        sub_bytes_en_o = (dec_key_gen_q == SP2V_LOW)  ? SP2V_HIGH : SP2V_LOW;
-        out_valid_o = (advance_chk == SP2V_HIGH &&
-            Masking == prng_reseed_done_q &&
-                       !mux_sel_err_i && !sp_enc_err) ? SP2V_HIGH : SP2V_LOW;
-        // When using DOM S-Boxes, make the masking PRNG advance every cycle until the output is
-        // ready. For other S-Boxes, make it advance once only. Updating it while being stalled
-        // would cause non-DOM S-Boxes to be re-evaluated, thereby creating additional SCA leakage.
-        prng_update_o = (SBoxImpl == SBoxImplDom) ? (advance_chk == SP2V_LOW) :
-            Masking ? (out_valid_o == SP2V_HIGH && out_ready == SP2V_HIGH)    : 1'b0;
-        if (out_valid_o == SP2V_HIGH && out_ready == SP2V_HIGH) begin
-          sub_bytes_out_ack_o = (dec_key_gen_q == SP2V_LOW) ? SP2V_HIGH : SP2V_LOW;
-
-          // Clear the state.
-          state_we_o          = SP2V_HIGH;
-          crypt_d             = SP2V_LOW;
-          // If we were generating the decryption key and didn't get the handshake in the last
-          // regular round, we should clear dec_key_gen now.
-          dec_key_gen_d       = SP2V_LOW;
-          aes_cipher_ctrl_ns  = IDLE;
-        end
-      end
-
-      CLEAR_S: begin
-        // Clear the state with pseudo-random data.
-        state_we_o         = SP2V_HIGH;
-        state_sel_o        = STATE_CLEAR;
-        aes_cipher_ctrl_ns = CLEAR_KD;
-      end
-
-      CLEAR_KD: begin
-        // Clear internal key registers and/or external data output registers.
-        if (key_clear_q) begin
-          key_full_sel_o = KEY_FULL_CLEAR;
-          key_full_we_o  = SP2V_HIGH;
-          key_dec_sel_o  = KEY_DEC_CLEAR;
-          key_dec_we_o   = SP2V_HIGH;
-        end
-        if (data_out_clear_q) begin
-          // Forward the state (previously cleared with psuedo-random data).
-          add_rk_sel_o    = ADD_RK_INIT;
-          key_words_sel_o = KEY_WORDS_ZERO;
-          round_key_sel_o = ROUND_KEY_DIRECT;
-        end
-        // Indicate that we are done, wait for handshake.
-        out_valid_o = SP2V_HIGH;
-        if (out_ready == SP2V_HIGH) begin
-          key_clear_d        = 1'b0;
-          data_out_clear_d   = 1'b0;
-          aes_cipher_ctrl_ns = IDLE;
-        end
-      end
-
-      ERROR: begin
-        // Terminal error state
-        alert_o = 1'b1;
-      end
-
-      // We should never get here. If we do (e.g. via a malicious glitch), error out immediately.
-      default: begin
-        aes_cipher_ctrl_ns = ERROR;
-      end
-    endcase
-
-    // Unconditionally jump into the terminal error state in case a mux selector or a sparsely
-    // encoded signal becomes invalid, or in case we have detected a fault in the round counter.
-    if (mux_sel_err_i || sp_enc_err || rnd_ctr_err) begin
-      aes_cipher_ctrl_ns = ERROR;
+        .rnd_ctr_q_i           ( rnd_ctr_q                ),
+        .rnd_ctr_d_o           ( mr_rnd_ctr_d[i]          ), // OR-combine
+        .rnd_ctr_rem_q_i       ( rnd_ctr_rem_q            ),
+        .rnd_ctr_rem_d_o       ( mr_rnd_ctr_rem_d[i]      ), // OR-combine
+        .num_rounds_q_i        ( num_rounds_q             ),
+        .num_rounds_d_o        ( mr_num_rounds_d[i]       ), // OR-combine
+        .crypt_q_ni            ( crypt_q[i]               ), // Sparsified
+        .crypt_d_no            ( crypt_d[i]               ), // Sparsified
+        .dec_key_gen_q_ni      ( dec_key_gen_q[i]         ), // Sparsified
+        .dec_key_gen_d_no      ( dec_key_gen_d[i]         ), // Sparsified
+        .key_clear_q_i         ( key_clear_q              ),
+        .key_clear_d_o         ( mr_key_clear_d[i]        ), // AND-combine
+        .data_out_clear_q_i    ( data_out_clear_q         ),
+        .data_out_clear_d_o    ( mr_data_out_clear_d[i]   )  // AND-combine
+      );
     end
   end
 
-  // This primitive is used to place a size-only constraint on the
-  // flops in order to prevent FSM state encoding optimizations.
-  logic [StateWidth-1:0] aes_cipher_ctrl_cs_raw;
-  assign aes_cipher_ctrl_cs = aes_cipher_ctrl_e'(aes_cipher_ctrl_cs_raw);
-  prim_flop #(
-    .Width(StateWidth),
-    .ResetValue(StateWidth'(IDLE))
-  ) u_state_regs (
-    .clk_i,
-    .rst_ni,
-    .d_i ( aes_cipher_ctrl_ns     ),
-    .q_o ( aes_cipher_ctrl_cs_raw )
-  );
+  // Combine single-bit FSM outputs.
+  // OR: One bit is sufficient to drive the corresponding output bit high.
+  assign alert_o            = |mr_alert;
+  assign prng_update_o      = |mr_prng_update;
+  assign prng_reseed_req_o  = |mr_prng_reseed_req;
+  assign key_expand_clear_o = |mr_key_expand_clear;
+  // AND: Only if all bits are high, the corresponding status is signaled which will lead to
+  // the clearing of these trigger bits.
+  assign key_clear_d        = &mr_key_clear_d;
+  assign data_out_clear_d   = &mr_data_out_clear_d;
+
+  // Combine multi-bit, sparse FSM outputs. We simply OR them together. If the FSMs don't provide
+  // the same outputs, two cases are possible:
+  // - An invalid encoding results: A downstream checker will fire, see mux_sel_err_i.
+  // - A valid encoding results: The outputs are compared below to cover this case, see mr_err.
+  always_comb begin : combine_sparse_signals
+    state_sel_o     = state_sel_e'('0);
+    add_rk_sel_o    = add_rk_sel_e'('0);
+    key_full_sel_o  = key_full_sel_e'('0);
+    key_dec_sel_o   = key_dec_sel_e'('0);
+    key_words_sel_o = key_words_sel_e'('0);
+    round_key_sel_o = round_key_sel_e'('0);
+    mr_err          = 1'b0;
+
+    for (int i = 0; i < Sp2VWidth; i++) begin
+      state_sel_o     = state_sel_e'({state_sel_o}         | {mr_state_sel[i]});
+      add_rk_sel_o    = add_rk_sel_e'({add_rk_sel_o}       | {mr_add_rk_sel[i]});
+      key_full_sel_o  = key_full_sel_e'({key_full_sel_o}   | {mr_key_full_sel[i]});
+      key_dec_sel_o   = key_dec_sel_e'({key_dec_sel_o}     | {mr_key_dec_sel[i]});
+      key_words_sel_o = key_words_sel_e'({key_words_sel_o} | {mr_key_words_sel[i]});
+      round_key_sel_o = round_key_sel_e'({round_key_sel_o} | {mr_round_key_sel[i]});
+    end
+    // Compare every rail against the final combined values, not against a partial OR.
+    for (int i = 0; i < Sp2VWidth; i++) begin
+      mr_err |= state_sel_o     != mr_state_sel[i]     ||
+                add_rk_sel_o    != mr_add_rk_sel[i]    ||
+                key_full_sel_o  != mr_key_full_sel[i]  ||
+                key_dec_sel_o   != mr_key_dec_sel[i]   ||
+                key_words_sel_o != mr_key_words_sel[i] ||
+                round_key_sel_o != mr_round_key_sel[i];
+    end
+  end
+
+  // Collect errors in mux selector signals.
+  assign mux_sel_err = mux_sel_err_i | mr_err;
+
+  // Combine counter signals. We simply OR them together. If the FSMs don't provide the same
+  // outputs, this will be detected by the round counter protection logic below.
+  always_comb begin : combine_counter_signals
+    rnd_ctr_d     = '0;
+    rnd_ctr_rem_d = '0;
+    num_rounds_d  = '0;
+    for (int i = 0; i < Sp2VWidth; i++) begin
+      rnd_ctr_d     |= mr_rnd_ctr_d[i];
+      rnd_ctr_rem_d |= mr_rnd_ctr_rem_d[i];
+      num_rounds_d  |= mr_num_rounds_d[i];
+    end
+  end
 
   always_ff @(posedge clk_i or negedge rst_ni) begin : reg_fsm
     if (!rst_ni) begin
-      num_rounds_q       <= '0;
       key_clear_q        <= 1'b0;
       data_out_clear_q   <= 1'b0;
-      prng_reseed_done_q <= 1'b0;
     end else begin
-      num_rounds_q       <= num_rounds_d;
       key_clear_q        <= key_clear_d;
       data_out_clear_q   <= data_out_clear_d;
-      prng_reseed_done_q <= prng_reseed_done_d;
     end
   end
 
-  // Use separate signal for number of regular rounds.
-  assign num_rounds_regular = num_rounds_q - 4'd1;
-
   // Use separate signal for key expand operation, forward round.
   assign key_expand_op_o    = (dec_key_gen_d == SP2V_HIGH ||
                                dec_key_gen_q == SP2V_HIGH) ? CIPH_FWD : op_i;
@@ -499,6 +359,16 @@
   );
 
   prim_flop #(
+    .Width(4),
+    .ResetValue('0)
+  ) u_num_rounds_regs (
+    .clk_i,
+    .rst_ni,
+    .d_i ( num_rounds_d ),
+    .q_o ( num_rounds_q )
+  );
+
+  prim_flop #(
     .Width(1),
     .ResetValue('0)
   ) u_rnd_ctr_par_reg (
@@ -559,7 +429,7 @@
   );
 
   // We use vectors of sparsely encoded signals to reduce code duplication.
-  localparam int unsigned NumSp2VSig = 9;
+  localparam int unsigned NumSp2VSig = 8;
   sp2v_e [NumSp2VSig-1:0]                sp2v_sig;
   sp2v_e [NumSp2VSig-1:0]                sp2v_sig_chk;
   logic  [NumSp2VSig-1:0][Sp2VWidth-1:0] sp2v_sig_chk_raw;
@@ -571,9 +441,8 @@
   assign sp2v_sig[3] = dec_key_gen_i;
   assign sp2v_sig[4] = sp2v_e'(crypt_q_raw);
   assign sp2v_sig[5] = sp2v_e'(dec_key_gen_q_raw);
-  assign sp2v_sig[6] = advance;
-  assign sp2v_sig[7] = sub_bytes_out_req_i;
-  assign sp2v_sig[8] = key_expand_out_req_i;
+  assign sp2v_sig[6] = sub_bytes_out_req_i;
+  assign sp2v_sig[7] = key_expand_out_req_i;
 
   // Individually check sparsely encoded signals.
   for (genvar i = 0; i < NumSp2VSig; i++) begin : gen_sel_buf_chk
@@ -596,9 +465,8 @@
   assign dec_key_gen        = sp2v_sig_chk[3];
   assign crypt_q            = sp2v_sig_chk[4];
   assign dec_key_gen_q      = sp2v_sig_chk[5];
-  assign advance_chk        = sp2v_sig_chk[6];
-  assign sub_bytes_out_req  = sp2v_sig_chk[7];
-  assign key_expand_out_req = sp2v_sig_chk[8];
+  assign sub_bytes_out_req  = sp2v_sig_chk[6];
+  assign key_expand_out_req = sp2v_sig_chk[7];
 
   // Collect encoding errors.
   // We instantiate the checker modules as close as possible to where the sparsely encoded signals
@@ -611,18 +479,5 @@
 
   // Selectors must be known/valid
   `ASSERT_KNOWN(AesCiphOpKnown, op_i)
-  `ASSERT(AesKeyLenValid, cfg_valid_i |-> key_len_i inside {
-      AES_128,
-      AES_192,
-      AES_256
-      })
-  `ASSERT(AesControlStateValid, !alert_o |-> aes_cipher_ctrl_cs inside {
-      IDLE,
-      INIT,
-      ROUND,
-      FINISH,
-      CLEAR_S,
-      CLEAR_KD
-      })
 
 endmodule
diff --git a/hw/ip/aes/rtl/aes_cipher_control_fsm.sv b/hw/ip/aes/rtl/aes_cipher_control_fsm.sv
new file mode 100644
index 0000000..d5d4d28
--- /dev/null
+++ b/hw/ip/aes/rtl/aes_cipher_control_fsm.sv
@@ -0,0 +1,459 @@
+// Copyright lowRISC contributors.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// AES cipher core control FSM
+//
+// This module contains the AES cipher core control FSM.
+
+`include "prim_assert.sv"
+
+module aes_cipher_control_fsm import aes_pkg::*;
+#(
+  parameter bit         Masking  = 0,
+  parameter sbox_impl_e SBoxImpl = SBoxImplLut
+) (
+  input  logic             clk_i,
+  input  logic             rst_ni,
+
+  // Input handshake signals
+  input  logic             in_valid_i,           // Sparsify using multi-rail.
+  output logic             in_ready_o,           // Sparsify using multi-rail.
+
+  // Output handshake signals
+  output logic             out_valid_o,          // Sparsify using multi-rail.
+  input  logic             out_ready_i,          // Sparsify using multi-rail.
+
+  // Control and sync signals
+  input  logic             cfg_valid_i,          // Used for SVAs only.
+  input  ciph_op_e         op_i,
+  input  key_len_e         key_len_i,
+  input  logic             crypt_i,              // Sparsify using multi-rail.
+  input  logic             dec_key_gen_i,        // Sparsify using multi-rail.
+  input  logic             key_clear_i,
+  input  logic             data_out_clear_i,
+  input  logic             mux_sel_err_i,
+  input  logic             sp_enc_err_i,
+  output logic             alert_o,
+
+  // Control signals for masking PRNG
+  output logic             prng_update_o,
+  output logic             prng_reseed_req_o,
+  input  logic             prng_reseed_ack_i,
+
+  // Control and sync signals for cipher data path
+  output state_sel_e       state_sel_o,
+  output logic             state_we_o,           // Sparsify using multi-rail.
+  output logic             sub_bytes_en_o,       // Sparsify using multi-rail.
+  input  logic             sub_bytes_out_req_i,  // Sparsify using multi-rail.
+  output logic             sub_bytes_out_ack_o,  // Sparsify using multi-rail.
+  output add_rk_sel_e      add_rk_sel_o,
+
+  // Control and sync signals for key expand data path
+  output key_full_sel_e    key_full_sel_o,
+  output logic             key_full_we_o,        // Sparsify using multi-rail.
+  output key_dec_sel_e     key_dec_sel_o,
+  output logic             key_dec_we_o,         // Sparsify using multi-rail.
+  output logic             key_expand_en_o,      // Sparsify using multi-rail.
+  input  logic             key_expand_out_req_i, // Sparsify using multi-rail.
+  output logic             key_expand_out_ack_o, // Sparsify using multi-rail.
+  output logic             key_expand_clear_o,
+  output key_words_sel_e   key_words_sel_o,
+  output round_key_sel_e   round_key_sel_o,
+
+  // Register signals
+  input  logic [3:0]       rnd_ctr_q_i,
+  output logic [3:0]       rnd_ctr_d_o,
+  input  logic [3:0]       rnd_ctr_rem_q_i,
+  output logic [3:0]       rnd_ctr_rem_d_o,
+  input  logic [3:0]       num_rounds_q_i,
+  output logic [3:0]       num_rounds_d_o,
+  input  logic             rnd_ctr_err_i,
+  input  logic             crypt_q_i,            // Sparsify using multi-rail.
+  output logic             crypt_d_o,            // Sparsify using multi-rail.
+  input  logic             dec_key_gen_q_i,      // Sparsify using multi-rail.
+  output logic             dec_key_gen_d_o,      // Sparsify using multi-rail.
+  input  logic             key_clear_q_i,
+  output logic             key_clear_d_o,
+  input  logic             data_out_clear_q_i,
+  output logic             data_out_clear_d_o
+);
+
+  // Types
+  // $ ./sparse-fsm-encode.py -d 3 -m 7 -n 6 \
+  //      -s 31468618 --language=sv
+  //
+  // Hamming distance histogram:
+  //
+  //  0: --
+  //  1: --
+  //  2: --
+  //  3: |||||||||||||||||||| (57.14%)
+  //  4: ||||||||||||||| (42.86%)
+  //  5: --
+  //  6: --
+  //
+  // Minimum Hamming distance: 3
+  // Maximum Hamming distance: 4
+  //
+  localparam int StateWidth = 6;
+  typedef enum logic [StateWidth-1:0] {
+    IDLE     = 6'b111100,
+    INIT     = 6'b101001,
+    ROUND    = 6'b010000,
+    FINISH   = 6'b100010,
+    CLEAR_S  = 6'b011011,
+    CLEAR_KD = 6'b110111,
+    ERROR    = 6'b001110
+  } aes_cipher_ctrl_e;
+
+  // cfg_valid_i is used for SVAs only.
+  logic unused_cfg_valid;
+  assign unused_cfg_valid = cfg_valid_i;
+
+  // Signals
+  aes_cipher_ctrl_e aes_cipher_ctrl_ns, aes_cipher_ctrl_cs;
+  logic             advance;
+  logic             prng_reseed_done_d, prng_reseed_done_q;
+  logic       [3:0] num_rounds_regular;
+
+  // Use separate signal for number of regular rounds.
+  assign num_rounds_regular = num_rounds_q_i - 4'd1;
+
+  // FSM
+  always_comb begin : aes_cipher_ctrl_fsm
+
+    // Handshake signals
+    in_ready_o           = 1'b0;
+    out_valid_o          = 1'b0;
+
+    // Masking PRNG signals
+    prng_update_o        = 1'b0;
+    prng_reseed_req_o    = 1'b0;
+
+    // Cipher data path
+    state_sel_o          = STATE_ROUND;
+    state_we_o           = 1'b0;
+    add_rk_sel_o         = ADD_RK_ROUND;
+    sub_bytes_en_o       = 1'b0;
+    sub_bytes_out_ack_o  = 1'b0;
+
+    // Key expand data path
+    key_full_sel_o       = KEY_FULL_ROUND;
+    key_full_we_o        = 1'b0;
+    key_dec_sel_o        = KEY_DEC_EXPAND;
+    key_dec_we_o         = 1'b0;
+    key_expand_en_o      = 1'b0;
+    key_expand_out_ack_o = 1'b0;
+    key_expand_clear_o   = 1'b0;
+    key_words_sel_o      = KEY_WORDS_ZERO;
+    round_key_sel_o      = ROUND_KEY_DIRECT;
+
+    // FSM
+    aes_cipher_ctrl_ns   = aes_cipher_ctrl_cs;
+    num_rounds_d_o       = num_rounds_q_i;
+    rnd_ctr_d_o          = rnd_ctr_q_i;
+    rnd_ctr_rem_d_o      = rnd_ctr_rem_q_i;
+    crypt_d_o            = crypt_q_i;
+    dec_key_gen_d_o      = dec_key_gen_q_i;
+    key_clear_d_o        = key_clear_q_i;
+    data_out_clear_d_o   = data_out_clear_q_i;
+    prng_reseed_done_d   = prng_reseed_done_q | prng_reseed_ack_i;
+    advance              = 1'b0;
+
+    // Alert
+    alert_o              = 1'b0;
+
+    unique case (aes_cipher_ctrl_cs)
+
+      IDLE: begin
+        dec_key_gen_d_o = 1'b0;
+
+        // Signal that we are ready, wait for handshake.
+        in_ready_o = 1'b1;
+        if (in_valid_i) begin
+          if (key_clear_i || data_out_clear_i) begin
+            // Clear internal key registers. The cipher core muxes are used to clear the data
+            // output registers.
+            key_clear_d_o      = key_clear_i;
+            data_out_clear_d_o = data_out_clear_i;
+
+            // To clear the data output registers, we must first clear the state.
+            aes_cipher_ctrl_ns = data_out_clear_i ? CLEAR_S : CLEAR_KD;
+
+          end else if (dec_key_gen_i || crypt_i) begin
+            // Start encryption/decryption or generation of start key for decryption.
+            crypt_d_o       = ~dec_key_gen_i & crypt_i;
+            dec_key_gen_d_o =  dec_key_gen_i;
+
+            // Load input data to state
+            state_sel_o = dec_key_gen_i ? STATE_CLEAR : STATE_INIT;
+            state_we_o  = 1'b1;
+
+            // Make the masking PRNG advance. The current pseudo-random data is used to mask the
+            // input data.
+            prng_update_o = dec_key_gen_i ? 1'b0 : Masking;
+
+            // Init key expand
+            key_expand_clear_o = 1'b1;
+
+            // Load full key
+            key_full_sel_o = dec_key_gen_i ? KEY_FULL_ENC_INIT :
+                        (op_i == CIPH_FWD) ? KEY_FULL_ENC_INIT :
+                                             KEY_FULL_DEC_INIT;
+            key_full_we_o  = 1'b1;
+
+            // Load num_rounds, initialize round counters.
+            num_rounds_d_o = (key_len_i == AES_128) ? 4'd10 :
+                             (key_len_i == AES_192) ? 4'd12 :
+                                                      4'd14;
+            rnd_ctr_rem_d_o    = num_rounds_d_o;
+            rnd_ctr_d_o        = '0;
+            aes_cipher_ctrl_ns = INIT;
+
+          end else begin
+            // Handshake without a valid command. We should never get here. If we do (e.g. via a
+            // malicious glitch), error out immediately.
+            aes_cipher_ctrl_ns = ERROR;
+          end
+        end
+      end
+
+      INIT: begin
+        // Initial round: just add key to state (no SubBytes involved in this state).
+        add_rk_sel_o = ADD_RK_INIT;
+
+        // Select key words for initial add_round_key
+        key_words_sel_o = (dec_key_gen_q_i)            ? KEY_WORDS_ZERO :
+            (key_len_i == AES_128)                     ? KEY_WORDS_0123 :
+            (key_len_i == AES_192 && op_i == CIPH_FWD) ? KEY_WORDS_0123 :
+            (key_len_i == AES_192 && op_i == CIPH_INV) ? KEY_WORDS_2345 :
+            (key_len_i == AES_256 && op_i == CIPH_FWD) ? KEY_WORDS_0123 :
+            (key_len_i == AES_256 && op_i == CIPH_INV) ? KEY_WORDS_4567 : KEY_WORDS_ZERO;
+
+        // Clear masking PRNG reseed status.
+        prng_reseed_done_d = 1'b0;
+
+        // AES-256 has two round keys available right from beginning. Pseudo-random data is
+        // required by KeyExpand only.
+        if (key_len_i != AES_256) begin
+          // Advance in sync with KeyExpand. Based on the S-Box implementation, it can take
+          // multiple cycles to finish. Wait for handshake. The DOM S-Boxes take fresh PRD in
+          // every cycle except the last, so update the PRNG while the handshake is pending.
+          advance         = key_expand_out_req_i;
+          prng_update_o   = (SBoxImpl == SBoxImplDom) ? ~advance : Masking;
+          key_expand_en_o = 1'b1;
+          if (advance) begin
+            key_expand_out_ack_o = 1'b1;
+            state_we_o           = ~dec_key_gen_q_i;
+            key_full_we_o        = 1'b1;
+            rnd_ctr_d_o          = rnd_ctr_q_i     + 4'b0001;
+            rnd_ctr_rem_d_o      = rnd_ctr_rem_q_i - 4'b0001;
+            aes_cipher_ctrl_ns   = ROUND;
+          end
+        end else begin
+          state_we_o         = ~dec_key_gen_q_i;
+          rnd_ctr_d_o        = rnd_ctr_q_i     + 4'b0001;
+          rnd_ctr_rem_d_o    = rnd_ctr_rem_q_i - 4'b0001;
+          aes_cipher_ctrl_ns = ROUND;
+        end
+      end
+
+      ROUND: begin
+        // Normal rounds
+
+        // Select key words for add_round_key
+        key_words_sel_o = (dec_key_gen_q_i)            ? KEY_WORDS_ZERO :
+            (key_len_i == AES_128)                     ? KEY_WORDS_0123 :
+            (key_len_i == AES_192 && op_i == CIPH_FWD) ? KEY_WORDS_2345 :
+            (key_len_i == AES_192 && op_i == CIPH_INV) ? KEY_WORDS_0123 :
+            (key_len_i == AES_256 && op_i == CIPH_FWD) ? KEY_WORDS_4567 :
+            (key_len_i == AES_256 && op_i == CIPH_INV) ? KEY_WORDS_0123 : KEY_WORDS_ZERO;
+
+        // Keep requesting PRNG reseeding until it is acknowledged.
+        prng_reseed_req_o = Masking & ~prng_reseed_done_q;
+
+        // Select round key: direct or mixed (equivalent inverse cipher)
+        round_key_sel_o = (op_i == CIPH_FWD) ? ROUND_KEY_DIRECT : ROUND_KEY_MIXED;
+
+        // Advance in sync with SubBytes and KeyExpand. Based on the S-Box implementation, both can
+        // take multiple cycles to finish. Wait for handshake. Make the masking PRNG advance every
+        // cycle. The DOM S-Boxes take fresh PRD in every cycle except the last.
+        advance         = (dec_key_gen_q_i | sub_bytes_out_req_i) & key_expand_out_req_i;
+        prng_update_o   = (SBoxImpl == SBoxImplDom) ? ~advance : Masking;
+        sub_bytes_en_o  = ~dec_key_gen_q_i;
+        key_expand_en_o = 1'b1;
+        if (advance) begin
+          sub_bytes_out_ack_o  = ~dec_key_gen_q_i;
+          key_expand_out_ack_o = 1'b1;
+
+          state_we_o    = ~dec_key_gen_q_i;
+          key_full_we_o = 1'b1;
+
+          // Update round
+          rnd_ctr_d_o     = rnd_ctr_q_i     + 4'b0001;
+          rnd_ctr_rem_d_o = rnd_ctr_rem_q_i - 4'b0001;
+
+          // Are we doing the last regular round?
+          if (rnd_ctr_q_i == num_rounds_regular) begin
+            aes_cipher_ctrl_ns = FINISH;
+
+            if (dec_key_gen_q_i) begin
+              // Write decryption key.
+              key_dec_we_o = 1'b1;
+
+              // Indicate that we are done, try to perform the handshake. But we don't wait here.
+              // If we don't get the handshake now, we will wait in the finish state. When using
+              // masking, we only finish if the masking PRNG has been reseeded.
+              out_valid_o = Masking ? prng_reseed_done_q : 1'b1;
+              if (out_valid_o && out_ready_i) begin
+                // Go to idle state directly.
+                dec_key_gen_d_o    = 1'b0;
+                aes_cipher_ctrl_ns = IDLE;
+              end
+            end
+          end // rnd_ctr_q_i
+        end // SubBytes/KeyExpand REQ/ACK
+      end
+
+      FINISH: begin
+        // Final round
+
+        // Select key words for add_round_key
+        key_words_sel_o = (dec_key_gen_q_i)            ? KEY_WORDS_ZERO :
+            (key_len_i == AES_128)                     ? KEY_WORDS_0123 :
+            (key_len_i == AES_192 && op_i == CIPH_FWD) ? KEY_WORDS_2345 :
+            (key_len_i == AES_192 && op_i == CIPH_INV) ? KEY_WORDS_0123 :
+            (key_len_i == AES_256 && op_i == CIPH_FWD) ? KEY_WORDS_4567 :
+            (key_len_i == AES_256 && op_i == CIPH_INV) ? KEY_WORDS_0123 : KEY_WORDS_ZERO;
+
+        // Skip mix_columns
+        add_rk_sel_o = ADD_RK_FINAL;
+
+        // Keep requesting PRNG reseeding until it is acknowledged.
+        prng_reseed_req_o = Masking & ~prng_reseed_done_q;
+
+        // Once we're done, we won't need the state anymore. We actually clear it when progressing
+        // to the next state.
+        state_sel_o = STATE_CLEAR;
+
+        // Advance in sync with SubBytes. Based on the S-Box implementation, it can take multiple
+        // cycles to finish. Only indicate that we are done if:
+        // - we have valid output (SubBytes finished),
+        // - the masking PRNG has been reseeded (if masking is used),
+        // - all mux selector signals are valid (don't release data in case of errors), and
+        // - all sparsely encoded signals are valid (don't release data in case of errors).
+        // Perform both handshakes simultaneously.
+        advance        = sub_bytes_out_req_i | dec_key_gen_q_i;
+        sub_bytes_en_o = ~dec_key_gen_q_i;
+        out_valid_o    = (mux_sel_err_i || sp_enc_err_i) ? 1'b0 :
+                         Masking ? prng_reseed_done_q & advance : advance;
+        // When using DOM S-Boxes, make the masking PRNG advance every cycle until the output is
+        // ready. For other S-Boxes, make it advance once only. Updating it while being stalled
+        // would cause non-DOM S-Boxes to be re-evaluated, thereby creating additional SCA leakage.
+        prng_update_o = (SBoxImpl == SBoxImplDom) ? ~advance                  :
+                                          Masking ? out_valid_o & out_ready_i : 1'b0;
+        if (out_valid_o && out_ready_i) begin
+          sub_bytes_out_ack_o = ~dec_key_gen_q_i;
+
+          // Clear the state.
+          state_we_o          = 1'b1;
+          crypt_d_o           = 1'b0;
+          // If we were generating the decryption key and didn't get the handshake in the last
+          // regular round, we should clear dec_key_gen now.
+          dec_key_gen_d_o     = 1'b0;
+          aes_cipher_ctrl_ns  = IDLE;
+        end
+      end
+
+      CLEAR_S: begin
+        // Clear the state with pseudo-random data.
+        state_we_o         = 1'b1;
+        state_sel_o        = STATE_CLEAR;
+        aes_cipher_ctrl_ns = CLEAR_KD;
+      end
+
+      CLEAR_KD: begin
+        // Clear internal key registers and/or external data output registers.
+        if (key_clear_q_i) begin
+          key_full_sel_o = KEY_FULL_CLEAR;
+          key_full_we_o  = 1'b1;
+          key_dec_sel_o  = KEY_DEC_CLEAR;
+          key_dec_we_o   = 1'b1;
+        end
+        if (data_out_clear_q_i) begin
+          // Forward the state (previously cleared with pseudo-random data).
+          add_rk_sel_o    = ADD_RK_INIT;
+          key_words_sel_o = KEY_WORDS_ZERO;
+          round_key_sel_o = ROUND_KEY_DIRECT;
+        end
+        // Indicate that we are done, wait for handshake.
+        out_valid_o = 1'b1;
+        if (out_ready_i) begin
+          key_clear_d_o      = 1'b0;
+          data_out_clear_d_o = 1'b0;
+          aes_cipher_ctrl_ns = IDLE;
+        end
+      end
+
+      ERROR: begin
+        // Terminal error state
+        alert_o = 1'b1;
+      end
+
+      // We should never get here. If we do (e.g. via a malicious glitch), error out immediately.
+      default: begin
+        aes_cipher_ctrl_ns = ERROR;
+      end
+    endcase
+
+    // Unconditionally jump into the terminal error state in case a mux selector or a sparsely
+    // encoded signal becomes invalid, or in case we have detected a fault in the round counter.
+    if (mux_sel_err_i || sp_enc_err_i || rnd_ctr_err_i) begin
+      aes_cipher_ctrl_ns = ERROR;
+    end
+  end
+
+  // This primitive is used to place a size-only constraint on the
+  // flops in order to prevent FSM state encoding optimizations.
+  logic [StateWidth-1:0] aes_cipher_ctrl_cs_raw;
+  assign aes_cipher_ctrl_cs = aes_cipher_ctrl_e'(aes_cipher_ctrl_cs_raw);
+  prim_flop #(
+    .Width(StateWidth),
+    .ResetValue(StateWidth'(IDLE))
+  ) u_state_regs (
+    .clk_i,
+    .rst_ni,
+    .d_i ( aes_cipher_ctrl_ns     ),
+    .q_o ( aes_cipher_ctrl_cs_raw )
+  );
+
+  always_ff @(posedge clk_i or negedge rst_ni) begin : reg_fsm
+    if (!rst_ni) begin
+      prng_reseed_done_q <= 1'b0;
+    end else begin
+      prng_reseed_done_q <= prng_reseed_done_d;
+    end
+  end
+
+  ////////////////
+  // Assertions //
+  ////////////////
+
+  // Selectors must be known/valid
+  `ASSERT_KNOWN(AesCiphOpKnown, op_i)
+  `ASSERT(AesKeyLenValid, cfg_valid_i |-> key_len_i inside {
+      AES_128,
+      AES_192,
+      AES_256
+      })
+  `ASSERT(AesControlStateValid, !alert_o |-> aes_cipher_ctrl_cs inside {
+      IDLE,
+      INIT,
+      ROUND,
+      FINISH,
+      CLEAR_S,
+      CLEAR_KD
+      })
+
+endmodule
diff --git a/hw/ip/aes/rtl/aes_cipher_control_fsm_n.sv b/hw/ip/aes/rtl/aes_cipher_control_fsm_n.sv
new file mode 100644
index 0000000..3e71fe2
--- /dev/null
+++ b/hw/ip/aes/rtl/aes_cipher_control_fsm_n.sv
@@ -0,0 +1,397 @@
+// Copyright lowRISC contributors.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// AES cipher core control FSM
+//
+// This module contains the AES cipher core control FSM operating on and producing the negated
+// values of important control signals. This is achieved by:
+// - instantiating the regular AES cipher core control FSM operating on and producing the positive
+//   values of these signals, and
+// - inverting these signals between the regular FSM and the prim_buf synthesis barriers.
+// Synthesis tools will then push the inverters into the actual FSM.
+
+module aes_cipher_control_fsm_n import aes_pkg::*;
+#(
+  parameter bit         Masking  = 0,
+  parameter sbox_impl_e SBoxImpl = SBoxImplLut
+) (
+  input  logic             clk_i,
+  input  logic             rst_ni,
+
+  // Input handshake signals
+  input  logic             in_valid_ni,           // Sparsify using multi-rail.
+  output logic             in_ready_no,           // Sparsify using multi-rail.
+
+  // Output handshake signals
+  output logic             out_valid_no,          // Sparsify using multi-rail.
+  input  logic             out_ready_ni,          // Sparsify using multi-rail.
+
+  // Control and sync signals
+  input  logic             cfg_valid_i,           // Used for SVAs only.
+  input  ciph_op_e         op_i,
+  input  key_len_e         key_len_i,
+  input  logic             crypt_ni,              // Sparsify using multi-rail.
+  input  logic             dec_key_gen_ni,        // Sparsify using multi-rail.
+  input  logic             key_clear_i,
+  input  logic             data_out_clear_i,
+  input  logic             mux_sel_err_i,
+  input  logic             sp_enc_err_i,
+  input  logic             rnd_ctr_err_i,
+  output logic             alert_o,
+
+  // Control signals for masking PRNG
+  output logic             prng_update_o,
+  output logic             prng_reseed_req_o,
+  input  logic             prng_reseed_ack_i,
+
+  // Control and sync signals for cipher data path
+  output state_sel_e       state_sel_o,
+  output logic             state_we_no,           // Sparsify using multi-rail.
+  output logic             sub_bytes_en_no,       // Sparsify using multi-rail.
+  input  logic             sub_bytes_out_req_ni,  // Sparsify using multi-rail.
+  output logic             sub_bytes_out_ack_no,  // Sparsify using multi-rail.
+  output add_rk_sel_e      add_rk_sel_o,
+
+  // Control and sync signals for key expand data path
+  output key_full_sel_e    key_full_sel_o,
+  output logic             key_full_we_no,        // Sparsify using multi-rail.
+  output key_dec_sel_e     key_dec_sel_o,
+  output logic             key_dec_we_no,         // Sparsify using multi-rail.
+  output logic             key_expand_en_no,      // Sparsify using multi-rail.
+  input  logic             key_expand_out_req_ni, // Sparsify using multi-rail.
+  output logic             key_expand_out_ack_no, // Sparsify using multi-rail.
+  output logic             key_expand_clear_o,
+  output key_words_sel_e   key_words_sel_o,
+  output round_key_sel_e   round_key_sel_o,
+
+  // Register signals
+  input  logic [3:0]       rnd_ctr_q_i,
+  output logic [3:0]       rnd_ctr_d_o,
+  input  logic [3:0]       rnd_ctr_rem_q_i,
+  output logic [3:0]       rnd_ctr_rem_d_o,
+  input  logic [3:0]       num_rounds_q_i,
+  output logic [3:0]       num_rounds_d_o,
+  input  logic             crypt_q_ni,            // Sparsify using multi-rail.
+  output logic             crypt_d_no,            // Sparsify using multi-rail.
+  input  logic             dec_key_gen_q_ni,      // Sparsify using multi-rail.
+  output logic             dec_key_gen_d_no,      // Sparsify using multi-rail.
+  input  logic             key_clear_q_i,
+  output logic             key_clear_d_o,
+  input  logic             data_out_clear_q_i,
+  output logic             data_out_clear_d_o
+);
+
+  /////////////////////
+  // Input Buffering //
+  /////////////////////
+
+  localparam int NumInBufBits = $bits({
+    in_valid_ni,
+    out_ready_ni,
+    cfg_valid_i,
+    op_i,
+    key_len_i,
+    crypt_ni,
+    dec_key_gen_ni,
+    key_clear_i,
+    data_out_clear_i,
+    mux_sel_err_i,
+    sp_enc_err_i,
+    rnd_ctr_err_i,
+    prng_reseed_ack_i,
+    sub_bytes_out_req_ni,
+    key_expand_out_req_ni,
+    rnd_ctr_q_i,
+    rnd_ctr_rem_q_i,
+    num_rounds_q_i,
+    crypt_q_ni,
+    dec_key_gen_q_ni,
+    key_clear_q_i,
+    data_out_clear_q_i
+  });
+
+  logic [NumInBufBits-1:0] in, in_buf;
+
+  assign in = {
+    in_valid_ni,
+    out_ready_ni,
+    cfg_valid_i,
+    op_i,
+    key_len_i,
+    crypt_ni,
+    dec_key_gen_ni,
+    key_clear_i,
+    data_out_clear_i,
+    mux_sel_err_i,
+    sp_enc_err_i,
+    rnd_ctr_err_i,
+    prng_reseed_ack_i,
+    sub_bytes_out_req_ni,
+    key_expand_out_req_ni,
+    rnd_ctr_q_i,
+    rnd_ctr_rem_q_i,
+    num_rounds_q_i,
+    crypt_q_ni,
+    dec_key_gen_q_ni,
+    key_clear_q_i,
+    data_out_clear_q_i
+  };
+
+  // This primitive is used to place a size-only constraint on the
+  // buffers to act as a synthesis optimization barrier.
+  prim_buf #(
+    .Width(NumInBufBits)
+  ) u_prim_buf_in (
+    .in_i(in),
+    .out_o(in_buf)
+  );
+
+  logic             in_valid_n;
+  logic             out_ready_n;
+  logic             cfg_valid;
+  ciph_op_e         op;                   // Same enum type as op_i and the FSM's op_i port.
+  key_len_e         key_len;
+  logic             crypt_n;
+  logic             dec_key_gen_n;
+  logic             key_clear;
+  logic             data_out_clear;
+  logic             mux_sel_err;
+  logic             sp_enc_err;
+  logic             rnd_ctr_err;
+  logic             prng_reseed_ack;
+  logic             sub_bytes_out_req_n;
+  logic             key_expand_out_req_n;
+  logic [3:0]       rnd_ctr_q;
+  logic [3:0]       rnd_ctr_rem_q;
+  logic [3:0]       num_rounds_q;
+  logic             crypt_q_n;
+  logic             dec_key_gen_q_n;
+  logic             key_clear_q;
+  logic             data_out_clear_q;
+
+  assign {in_valid_n,
+          out_ready_n,
+          cfg_valid,
+          op,
+          key_len,
+          crypt_n,
+          dec_key_gen_n,
+          key_clear,
+          data_out_clear,
+          mux_sel_err,
+          sp_enc_err,
+          rnd_ctr_err,
+          prng_reseed_ack,
+          sub_bytes_out_req_n,
+          key_expand_out_req_n,
+          rnd_ctr_q,
+          rnd_ctr_rem_q,
+          num_rounds_q,
+          crypt_q_n,
+          dec_key_gen_q_n,
+          key_clear_q,
+          data_out_clear_q} = in_buf;
+
+  // Intermediate output signals
+  logic             in_ready;
+  logic             out_valid;
+  logic             alert;
+  logic             prng_update;
+  logic             prng_reseed_req;
+  state_sel_e       state_sel;
+  logic             state_we;
+  logic             sub_bytes_en;
+  logic             sub_bytes_out_ack;
+  add_rk_sel_e      add_rk_sel;
+  key_full_sel_e    key_full_sel;
+  logic             key_full_we;
+  key_dec_sel_e     key_dec_sel;
+  logic             key_dec_we;
+  logic             key_expand_en;
+  logic             key_expand_out_ack;
+  logic             key_expand_clear;
+  key_words_sel_e   key_words_sel;
+  round_key_sel_e   round_key_sel;
+  logic [3:0]       rnd_ctr_d;
+  logic [3:0]       rnd_ctr_rem_d;
+  logic [3:0]       num_rounds_d;
+  logic             crypt_d;
+  logic             dec_key_gen_d;
+  logic             key_clear_d;
+  logic             data_out_clear_d;
+
+  /////////////////
+  // Regular FSM //
+  /////////////////
+
+  // The regular FSM operates on and produces the positive values of important control signals.
+  // Invert *_n input signals here to get the positive values for the regular FSM. To obtain the
+  // negated outputs, important output signals are inverted further below. Thanks to the prim_buf
+  // synthesis optimization barriers, tools will push the inverters into the regular FSM.
+  aes_cipher_control_fsm #(
+    .Masking  ( Masking  ),
+    .SBoxImpl ( SBoxImpl )
+  ) u_aes_cipher_control_fsm (
+    .clk_i                 ( clk_i                 ),
+    .rst_ni                ( rst_ni                ),
+
+    .in_valid_i            ( ~in_valid_n           ), // Invert for regular FSM.
+    .in_ready_o            ( in_ready              ), // Invert below for negated output.
+
+    .out_valid_o           ( out_valid             ), // Invert below for negated output.
+    .out_ready_i           ( ~out_ready_n          ), // Invert for regular FSM.
+
+    .cfg_valid_i           ( cfg_valid             ),
+    .op_i                  ( op                    ),
+    .key_len_i             ( key_len               ),
+    .crypt_i               ( ~crypt_n              ), // Invert for regular FSM.
+    .dec_key_gen_i         ( ~dec_key_gen_n        ), // Invert for regular FSM.
+    .key_clear_i           ( key_clear             ),
+    .data_out_clear_i      ( data_out_clear        ),
+    .mux_sel_err_i         ( mux_sel_err           ),
+    .sp_enc_err_i          ( sp_enc_err            ),
+    .rnd_ctr_err_i         ( rnd_ctr_err           ),
+    .alert_o               ( alert                 ),
+
+    .prng_update_o         ( prng_update           ),
+    .prng_reseed_req_o     ( prng_reseed_req       ),
+    .prng_reseed_ack_i     ( prng_reseed_ack       ),
+
+    .state_sel_o           ( state_sel             ),
+    .state_we_o            ( state_we              ), // Invert below for negated output.
+    .sub_bytes_en_o        ( sub_bytes_en          ), // Invert below for negated output.
+    .sub_bytes_out_req_i   ( ~sub_bytes_out_req_n  ), // Invert for regular FSM.
+    .sub_bytes_out_ack_o   ( sub_bytes_out_ack     ), // Invert below for negated output.
+    .add_rk_sel_o          ( add_rk_sel            ),
+
+    .key_full_sel_o        ( key_full_sel          ),
+    .key_full_we_o         ( key_full_we           ), // Invert below for negated output.
+    .key_dec_sel_o         ( key_dec_sel           ),
+    .key_dec_we_o          ( key_dec_we            ), // Invert below for negated output.
+    .key_expand_en_o       ( key_expand_en         ), // Invert below for negated output.
+    .key_expand_out_req_i  ( ~key_expand_out_req_n ), // Invert for regular FSM.
+    .key_expand_out_ack_o  ( key_expand_out_ack    ), // Invert below for negated output.
+    .key_expand_clear_o    ( key_expand_clear      ),
+    .key_words_sel_o       ( key_words_sel         ),
+    .round_key_sel_o       ( round_key_sel         ),
+
+    .rnd_ctr_q_i           ( rnd_ctr_q             ),
+    .rnd_ctr_d_o           ( rnd_ctr_d             ),
+    .rnd_ctr_rem_q_i       ( rnd_ctr_rem_q         ),
+    .rnd_ctr_rem_d_o       ( rnd_ctr_rem_d         ),
+    .num_rounds_q_i        ( num_rounds_q          ),
+    .num_rounds_d_o        ( num_rounds_d          ),
+    .crypt_q_i             ( ~crypt_q_n            ), // Invert for regular FSM.
+    .crypt_d_o             ( crypt_d               ), // Invert below for negated output.
+    .dec_key_gen_q_i       ( ~dec_key_gen_q_n      ), // Invert for regular FSM.
+    .dec_key_gen_d_o       ( dec_key_gen_d         ), // Invert below for negated output.
+    .key_clear_q_i         ( key_clear_q           ),
+    .key_clear_d_o         ( key_clear_d           ),
+    .data_out_clear_q_i    ( data_out_clear_q      ),
+    .data_out_clear_d_o    ( data_out_clear_d      )
+  );
+
+  //////////////////////
+  // Output Buffering //
+  //////////////////////
+
+  localparam int NumOutBufBits = $bits({
+    in_ready_no,
+    out_valid_no,
+    alert_o,
+    prng_update_o,
+    prng_reseed_req_o,
+    state_sel_o,
+    state_we_no,
+    sub_bytes_en_no,
+    sub_bytes_out_ack_no,
+    add_rk_sel_o,
+    key_full_sel_o,
+    key_full_we_no,
+    key_dec_sel_o,
+    key_dec_we_no,
+    key_expand_en_no,
+    key_expand_out_ack_no,
+    key_expand_clear_o,
+    key_words_sel_o,
+    round_key_sel_o,
+    rnd_ctr_d_o,
+    rnd_ctr_rem_d_o,
+    num_rounds_d_o,
+    crypt_d_no,
+    dec_key_gen_d_no,
+    key_clear_d_o,
+    data_out_clear_d_o
+  });
+
+  logic [NumOutBufBits-1:0] out, out_buf;
+
+  // Important output control signals need to be inverted here. Synthesis tools will push the
+  // inverters back into the regular FSM.
+  assign out = {
+    ~in_ready,
+    ~out_valid,
+    alert,
+    prng_update,
+    prng_reseed_req,
+    state_sel,
+    ~state_we,
+    ~sub_bytes_en,
+    ~sub_bytes_out_ack,
+    add_rk_sel,
+    key_full_sel,
+    ~key_full_we,
+    key_dec_sel,
+    ~key_dec_we,
+    ~key_expand_en,
+    ~key_expand_out_ack,
+    key_expand_clear,
+    key_words_sel,
+    round_key_sel,
+    rnd_ctr_d,
+    rnd_ctr_rem_d,
+    num_rounds_d,
+    ~crypt_d,
+    ~dec_key_gen_d,
+    key_clear_d,
+    data_out_clear_d
+  };
+
+  // This primitive is used to place a size-only constraint on the
+  // buffers to act as a synthesis optimization barrier.
+  prim_buf #(
+    .Width(NumOutBufBits)
+  ) u_prim_buf_out (
+    .in_i(out),
+    .out_o(out_buf)
+  );
+
+  assign {in_ready_no,
+          out_valid_no,
+          alert_o,
+          prng_update_o,
+          prng_reseed_req_o,
+          state_sel_o,
+          state_we_no,
+          sub_bytes_en_no,
+          sub_bytes_out_ack_no,
+          add_rk_sel_o,
+          key_full_sel_o,
+          key_full_we_no,
+          key_dec_sel_o,
+          key_dec_we_no,
+          key_expand_en_no,
+          key_expand_out_ack_no,
+          key_expand_clear_o,
+          key_words_sel_o,
+          round_key_sel_o,
+          rnd_ctr_d_o,
+          rnd_ctr_rem_d_o,
+          num_rounds_d_o,
+          crypt_d_no,
+          dec_key_gen_d_no,
+          key_clear_d_o,
+          data_out_clear_d_o} = out_buf;
+
+endmodule
diff --git a/hw/ip/aes/rtl/aes_cipher_control_fsm_p.sv b/hw/ip/aes/rtl/aes_cipher_control_fsm_p.sv
new file mode 100644
index 0000000..654cbf8
--- /dev/null
+++ b/hw/ip/aes/rtl/aes_cipher_control_fsm_p.sv
@@ -0,0 +1,387 @@
+// Copyright lowRISC contributors.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// AES cipher core control FSM
+//
+// This module contains the AES cipher core control FSM operating on
+// and producing the positive values of important control signals.
+
+module aes_cipher_control_fsm_p import aes_pkg::*;
+#(
+  parameter bit         Masking  = 0,
+  parameter sbox_impl_e SBoxImpl = SBoxImplLut
+) (
+  input  logic             clk_i,
+  input  logic             rst_ni,
+
+  // Input handshake signals
+  input  logic             in_valid_i,            // Sparsify using multi-rail.
+  output logic             in_ready_o,            // Sparsify using multi-rail.
+
+  // Output handshake signals
+  output logic             out_valid_o,           // Sparsify using multi-rail.
+  input  logic             out_ready_i,           // Sparsify using multi-rail.
+
+  // Control and sync signals
+  input  logic             cfg_valid_i,           // Used for SVAs only.
+  input  ciph_op_e         op_i,
+  input  key_len_e         key_len_i,
+  input  logic             crypt_i,               // Sparsify using multi-rail.
+  input  logic             dec_key_gen_i,         // Sparsify using multi-rail.
+  input  logic             key_clear_i,
+  input  logic             data_out_clear_i,
+  input  logic             mux_sel_err_i,
+  input  logic             sp_enc_err_i,
+  input  logic             rnd_ctr_err_i,
+  output logic             alert_o,
+
+  // Control signals for masking PRNG
+  output logic             prng_update_o,
+  output logic             prng_reseed_req_o,
+  input  logic             prng_reseed_ack_i,
+
+  // Control and sync signals for cipher data path
+  output state_sel_e       state_sel_o,
+  output logic             state_we_o,            // Sparsify using multi-rail.
+  output logic             sub_bytes_en_o,        // Sparsify using multi-rail.
+  input  logic             sub_bytes_out_req_i,   // Sparsify using multi-rail.
+  output logic             sub_bytes_out_ack_o,   // Sparsify using multi-rail.
+  output add_rk_sel_e      add_rk_sel_o,
+
+  // Control and sync signals for key expand data path
+  output key_full_sel_e    key_full_sel_o,
+  output logic             key_full_we_o,         // Sparsify using multi-rail.
+  output key_dec_sel_e     key_dec_sel_o,
+  output logic             key_dec_we_o,          // Sparsify using multi-rail.
+  output logic             key_expand_en_o,       // Sparsify using multi-rail.
+  input  logic             key_expand_out_req_i,  // Sparsify using multi-rail.
+  output logic             key_expand_out_ack_o,  // Sparsify using multi-rail.
+  output logic             key_expand_clear_o,
+  output key_words_sel_e   key_words_sel_o,
+  output round_key_sel_e   round_key_sel_o,
+
+  // Register signals
+  input  logic [3:0]       rnd_ctr_q_i,
+  output logic [3:0]       rnd_ctr_d_o,
+  input  logic [3:0]       rnd_ctr_rem_q_i,
+  output logic [3:0]       rnd_ctr_rem_d_o,
+  input  logic [3:0]       num_rounds_q_i,
+  output logic [3:0]       num_rounds_d_o,
+  input  logic             crypt_q_i,             // Sparsify using multi-rail.
+  output logic             crypt_d_o,             // Sparsify using multi-rail.
+  input  logic             dec_key_gen_q_i,       // Sparsify using multi-rail.
+  output logic             dec_key_gen_d_o,       // Sparsify using multi-rail.
+  input  logic             key_clear_q_i,
+  output logic             key_clear_d_o,
+  input  logic             data_out_clear_q_i,
+  output logic             data_out_clear_d_o
+);
+
+  /////////////////////
+  // Input Buffering //
+  /////////////////////
+
+  // All inputs except clock and reset are packed into a single vector, routed through one
+  // prim_buf instance and unpacked again. The three lists below (width computation, packing,
+  // unpacking) must list the signals in exactly the same order and with matching types.
+
+  localparam int NumInBufBits = $bits({
+    in_valid_i,
+    out_ready_i,
+    cfg_valid_i,
+    op_i,
+    key_len_i,
+    crypt_i,
+    dec_key_gen_i,
+    key_clear_i,
+    data_out_clear_i,
+    mux_sel_err_i,
+    sp_enc_err_i,
+    rnd_ctr_err_i,
+    prng_reseed_ack_i,
+    sub_bytes_out_req_i,
+    key_expand_out_req_i,
+    rnd_ctr_q_i,
+    rnd_ctr_rem_q_i,
+    num_rounds_q_i,
+    crypt_q_i,
+    dec_key_gen_q_i,
+    key_clear_q_i,
+    data_out_clear_q_i
+  });
+
+  logic [NumInBufBits-1:0] in, in_buf;
+
+  assign in = {
+    in_valid_i,
+    out_ready_i,
+    cfg_valid_i,
+    op_i,
+    key_len_i,
+    crypt_i,
+    dec_key_gen_i,
+    key_clear_i,
+    data_out_clear_i,
+    mux_sel_err_i,
+    sp_enc_err_i,
+    rnd_ctr_err_i,
+    prng_reseed_ack_i,
+    sub_bytes_out_req_i,
+    key_expand_out_req_i,
+    rnd_ctr_q_i,
+    rnd_ctr_rem_q_i,
+    num_rounds_q_i,
+    crypt_q_i,
+    dec_key_gen_q_i,
+    key_clear_q_i,
+    data_out_clear_q_i
+  };
+
+  // This primitive is used to place a size-only constraint on the
+  // buffers to act as a synthesis optimization barrier.
+  prim_buf #(
+    .Width(NumInBufBits)
+  ) u_prim_buf_in (
+    .in_i(in),
+    .out_o(in_buf)
+  );
+
+  // Buffered copies of the inputs. Each one is declared with the same type as the port it
+  // mirrors so that the unpacking below consumes exactly the number of bits that the port
+  // contributes to NumInBufBits.
+  logic             in_valid;
+  logic             out_ready;
+  logic             cfg_valid;
+  ciph_op_e         op;                 // Same type as op_i, keeps the unpacking aligned.
+  key_len_e         key_len;
+  logic             crypt;
+  logic             dec_key_gen;
+  logic             key_clear;
+  logic             data_out_clear;
+  logic             mux_sel_err;
+  logic             sp_enc_err;
+  logic             rnd_ctr_err;
+  logic             prng_reseed_ack;
+  logic             sub_bytes_out_req;
+  logic             key_expand_out_req;
+  logic [3:0]       rnd_ctr_q;
+  logic [3:0]       rnd_ctr_rem_q;
+  logic [3:0]       num_rounds_q;
+  logic             crypt_q;
+  logic             dec_key_gen_q;
+  logic             key_clear_q;
+  logic             data_out_clear_q;
+
+  assign {in_valid,
+          out_ready,
+          cfg_valid,
+          op,
+          key_len,
+          crypt,
+          dec_key_gen,
+          key_clear,
+          data_out_clear,
+          mux_sel_err,
+          sp_enc_err,
+          rnd_ctr_err,
+          prng_reseed_ack,
+          sub_bytes_out_req,
+          key_expand_out_req,
+          rnd_ctr_q,
+          rnd_ctr_rem_q,
+          num_rounds_q,
+          crypt_q,
+          dec_key_gen_q,
+          key_clear_q,
+          data_out_clear_q} = in_buf;
+
+  // Intermediate output signals
+  // Driven by the FSM instance below and forwarded to the module outputs via the output buffer.
+  logic             in_ready;
+  logic             out_valid;
+  logic             alert;
+  logic             prng_update;
+  logic             prng_reseed_req;
+  state_sel_e       state_sel;
+  logic             state_we;
+  logic             sub_bytes_en;
+  logic             sub_bytes_out_ack;
+  add_rk_sel_e      add_rk_sel;
+  key_full_sel_e    key_full_sel;
+  logic             key_full_we;
+  key_dec_sel_e     key_dec_sel;
+  logic             key_dec_we;
+  logic             key_expand_en;
+  logic             key_expand_out_ack;
+  logic             key_expand_clear;
+  key_words_sel_e   key_words_sel;
+  round_key_sel_e   round_key_sel;
+  logic [3:0]       rnd_ctr_d;
+  logic [3:0]       rnd_ctr_rem_d;
+  logic [3:0]       num_rounds_d;
+  logic             crypt_d;
+  logic             dec_key_gen_d;
+  logic             key_clear_d;
+  logic             data_out_clear_d;
+
+  /////////////////
+  // Regular FSM //
+  /////////////////
+
+  // Apart from clock and reset, the FSM is connected to the buffered inputs and the
+  // intermediate output signals only, never directly to the module ports.
+  aes_cipher_control_fsm #(
+    .Masking  ( Masking  ),
+    .SBoxImpl ( SBoxImpl )
+  ) u_aes_cipher_control_fsm (
+    .clk_i                 ( clk_i                  ),
+    .rst_ni                ( rst_ni                 ),
+
+    .in_valid_i            ( in_valid               ),
+    .in_ready_o            ( in_ready               ),
+
+    .out_valid_o           ( out_valid              ),
+    .out_ready_i           ( out_ready              ),
+
+    .cfg_valid_i           ( cfg_valid              ),
+    .op_i                  ( op                     ),
+    .key_len_i             ( key_len                ),
+    .crypt_i               ( crypt                  ),
+    .dec_key_gen_i         ( dec_key_gen            ),
+    .key_clear_i           ( key_clear              ),
+    .data_out_clear_i      ( data_out_clear         ),
+    .mux_sel_err_i         ( mux_sel_err            ),
+    .sp_enc_err_i          ( sp_enc_err             ),
+    .rnd_ctr_err_i         ( rnd_ctr_err            ),
+    .alert_o               ( alert                  ),
+
+    .prng_update_o         ( prng_update            ),
+    .prng_reseed_req_o     ( prng_reseed_req        ),
+    .prng_reseed_ack_i     ( prng_reseed_ack        ),
+
+    .state_sel_o           ( state_sel              ),
+    .state_we_o            ( state_we               ),
+    .sub_bytes_en_o        ( sub_bytes_en           ),
+    .sub_bytes_out_req_i   ( sub_bytes_out_req      ),
+    .sub_bytes_out_ack_o   ( sub_bytes_out_ack      ),
+    .add_rk_sel_o          ( add_rk_sel             ),
+
+    .key_full_sel_o        ( key_full_sel           ),
+    .key_full_we_o         ( key_full_we            ),
+    .key_dec_sel_o         ( key_dec_sel            ),
+    .key_dec_we_o          ( key_dec_we             ),
+    .key_expand_en_o       ( key_expand_en          ),
+    .key_expand_out_req_i  ( key_expand_out_req     ),
+    .key_expand_out_ack_o  ( key_expand_out_ack     ),
+    .key_expand_clear_o    ( key_expand_clear       ),
+    .key_words_sel_o       ( key_words_sel          ),
+    .round_key_sel_o       ( round_key_sel          ),
+
+    .rnd_ctr_q_i           ( rnd_ctr_q              ),
+    .rnd_ctr_d_o           ( rnd_ctr_d              ),
+    .rnd_ctr_rem_q_i       ( rnd_ctr_rem_q          ),
+    .rnd_ctr_rem_d_o       ( rnd_ctr_rem_d          ),
+    .num_rounds_q_i        ( num_rounds_q           ),
+    .num_rounds_d_o        ( num_rounds_d           ),
+    .crypt_q_i             ( crypt_q                ),
+    .crypt_d_o             ( crypt_d                ),
+    .dec_key_gen_q_i       ( dec_key_gen_q          ),
+    .dec_key_gen_d_o       ( dec_key_gen_d          ),
+    .key_clear_q_i         ( key_clear_q            ),
+    .key_clear_d_o         ( key_clear_d            ),
+    .data_out_clear_q_i    ( data_out_clear_q       ),
+    .data_out_clear_d_o    ( data_out_clear_d       )
+  );
+
+  //////////////////////
+  // Output Buffering //
+  //////////////////////
+
+  // The intermediate outputs are packed, buffered and unpacked in the same way as the inputs.
+  // No signal is inverted in this module: every output carries the positive value produced by
+  // the FSM. As for the inputs, the three lists below must keep an identical signal order.
+
+  localparam int NumOutBufBits = $bits({
+    in_ready_o,
+    out_valid_o,
+    alert_o,
+    prng_update_o,
+    prng_reseed_req_o,
+    state_sel_o,
+    state_we_o,
+    sub_bytes_en_o,
+    sub_bytes_out_ack_o,
+    add_rk_sel_o,
+    key_full_sel_o,
+    key_full_we_o,
+    key_dec_sel_o,
+    key_dec_we_o,
+    key_expand_en_o,
+    key_expand_out_ack_o,
+    key_expand_clear_o,
+    key_words_sel_o,
+    round_key_sel_o,
+    rnd_ctr_d_o,
+    rnd_ctr_rem_d_o,
+    num_rounds_d_o,
+    crypt_d_o,
+    dec_key_gen_d_o,
+    key_clear_d_o,
+    data_out_clear_d_o
+  });
+
+  logic [NumOutBufBits-1:0] out, out_buf;
+
+  assign out = {
+    in_ready,
+    out_valid,
+    alert,
+    prng_update,
+    prng_reseed_req,
+    state_sel,
+    state_we,
+    sub_bytes_en,
+    sub_bytes_out_ack,
+    add_rk_sel,
+    key_full_sel,
+    key_full_we,
+    key_dec_sel,
+    key_dec_we,
+    key_expand_en,
+    key_expand_out_ack,
+    key_expand_clear,
+    key_words_sel,
+    round_key_sel,
+    rnd_ctr_d,
+    rnd_ctr_rem_d,
+    num_rounds_d,
+    crypt_d,
+    dec_key_gen_d,
+    key_clear_d,
+    data_out_clear_d
+  };
+
+  // This primitive is used to place a size-only constraint on the
+  // buffers to act as a synthesis optimization barrier.
+  prim_buf #(
+    .Width(NumOutBufBits)
+  ) u_prim_buf_out (
+    .in_i(out),
+    .out_o(out_buf)
+  );
+
+  assign {in_ready_o,
+          out_valid_o,
+          alert_o,
+          prng_update_o,
+          prng_reseed_req_o,
+          state_sel_o,
+          state_we_o,
+          sub_bytes_en_o,
+          sub_bytes_out_ack_o,
+          add_rk_sel_o,
+          key_full_sel_o,
+          key_full_we_o,
+          key_dec_sel_o,
+          key_dec_we_o,
+          key_expand_en_o,
+          key_expand_out_ack_o,
+          key_expand_clear_o,
+          key_words_sel_o,
+          round_key_sel_o,
+          rnd_ctr_d_o,
+          rnd_ctr_rem_d_o,
+          num_rounds_d_o,
+          crypt_d_o,
+          dec_key_gen_d_o,
+          key_clear_d_o,
+          data_out_clear_d_o} = out_buf;
+
+endmodule