[aes/rtl] Add Non-LUT-based S-Box

This commit adapts the design to select either the previous
LUT-based S-Box or a non-LUT-based Canright S-Box using a parameter.

Signed-off-by: Pirmin Vogel <vogelpi@lowrisc.org>
diff --git a/hw/ip/aes/aes.core b/hw/ip/aes/aes.core
index f6f9569..4dfb1ca 100644
--- a/hw/ip/aes/aes.core
+++ b/hw/ip/aes/aes.core
@@ -15,7 +15,9 @@
       - rtl/aes_reg_top.sv
       - rtl/aes_core.sv
       - rtl/aes_sub_bytes.sv
+      - rtl/aes_sbox.sv
       - rtl/aes_sbox_lut.sv
+      - rtl/aes_sbox_canright.sv
       - rtl/aes_shift_rows.sv
       - rtl/aes_mix_columns.sv
       - rtl/aes_mix_single_column.sv
diff --git a/hw/ip/aes/doc/_index.md b/hw/ip/aes/doc/_index.md
index d939ea8..b1feee8 100644
--- a/hw/ip/aes/doc/_index.md
+++ b/hw/ip/aes/doc/_index.md
@@ -155,13 +155,14 @@
 ### SubBytes / S-Box
 
 The SubBytes operation is a non-linear byte substitution that operates independently on each byte of the state using a substitution table (S-Box).
-
-The design of this S-Box and its inverse can have a big impact on circuit area, timing critical path, robustness and power leakage, and is itself its own research topic.
-The initial version of the AES unit uses a LUT-based S-Box implementation.
 It is both used for the cipher data path and the key expand data path.
 In total, 20 S-Boxes are used (16 for SubBytes, 4 for KeyExpand), each having 8-bit input and output.
 
+The design of this S-Box and its inverse can have a big impact on circuit area, timing critical path, robustness and power leakage, and is a research topic in its own right.
+
 Since the S-Boxes can be decoupled from the rest of the AES unit, they can easily be replaced by a different implementation if required.
+The AES unit currently uses a LUT-based S-Box implementation (default) but also supports the implementation proposed by [Canright: "A very compact Rijndael S-Box"](https://hdl.handle.net/10945/25608) (selectable by a compile-time parameter).
+
 A possible candidate implementation that employs masking (i.e. that randomizes the power consumption of the AES unit in every cipher round) to aggravate power analysis attacks has been proposed by [Canright and Batina: “A very compact “perfectly masked” S-Box for AES (corrected)”](https://eprint.iacr.org/2009/011.pdf).
 
 
diff --git a/hw/ip/aes/rtl/aes.sv b/hw/ip/aes/rtl/aes.sv
index 64c21b1..5276a66 100644
--- a/hw/ip/aes/rtl/aes.sv
+++ b/hw/ip/aes/rtl/aes.sv
@@ -5,7 +5,8 @@
 // AES top-level wrapper
 
 module aes #(
-  parameter bit AES192Enable = 1
+  parameter bit AES192Enable = 1,    // Can be 0 (disable), or 1 (enable).
+  parameter     SBoxImpl     = "lut" // Can be "lut" (LUT-based SBox), or "canright".
 ) (
   input                     clk_i,
   input                     rst_ni,
@@ -31,7 +32,8 @@
   );
 
   aes_core #(
-    .AES192Enable (AES192Enable)
+    .AES192Enable ( AES192Enable ),
+    .SBoxImpl     ( SBoxImpl     )
   ) aes_core (
     .clk_i,
     .rst_ni,
diff --git a/hw/ip/aes/rtl/aes_core.sv b/hw/ip/aes/rtl/aes_core.sv
index c66e625..1b79db4 100644
--- a/hw/ip/aes/rtl/aes_core.sv
+++ b/hw/ip/aes/rtl/aes_core.sv
@@ -5,7 +5,8 @@
 // AES core implementation
 
 module aes_core #(
-  parameter bit AES192Enable = 1
+  parameter bit AES192Enable = 1,
+  parameter     SBoxImpl     = "lut"
 ) (
   input                            clk_i,
   input                            rst_ni,
@@ -146,7 +147,9 @@
   end
 
   // Cipher data path
-  aes_sub_bytes aes_sub_bytes (
+  aes_sub_bytes #(
+  .SBoxImpl     ( SBoxImpl )
+  ) aes_sub_bytes (
     .mode_i ( mode_q        ),
     .data_i ( state_q       ),
     .data_o ( sub_bytes_out )
@@ -238,7 +241,8 @@
 
   // Key expand data path
   aes_key_expand #(
-  .AES192Enable (AES192Enable)
+  .AES192Enable ( AES192Enable ),
+  .SBoxImpl     ( SBoxImpl     )
   ) aes_key_expand (
     .clk_i     ( clk_i            ),
     .rst_ni    ( rst_ni           ),
diff --git a/hw/ip/aes/rtl/aes_key_expand.sv b/hw/ip/aes/rtl/aes_key_expand.sv
index d15ea42..c8db355 100644
--- a/hw/ip/aes/rtl/aes_key_expand.sv
+++ b/hw/ip/aes/rtl/aes_key_expand.sv
@@ -5,7 +5,8 @@
 // AES KeyExpand
 
 module aes_key_expand #(
-  parameter bit AES192Enable = 1
+  parameter bit AES192Enable = 1,
+  parameter     SBoxImpl     = "lut"
 ) (
   input  logic              clk_i,
   input  logic              rst_ni,
@@ -165,7 +166,9 @@
 
   // SubWord - individually substitute bytes
   for (genvar i = 0; i < 4; i++) begin : gen_sbox
-    aes_sbox_lut aes_sbox_i (
+    aes_sbox #(
+      .SBoxImpl ( SBoxImpl )
+    ) aes_sbox_i (
       .mode_i ( AES_ENC   ),
       .data_i ( sub_word_in[8*i +: 8]  ),
       .data_o ( sub_word_out[8*i +: 8] )
diff --git a/hw/ip/aes/rtl/aes_pkg.sv b/hw/ip/aes/rtl/aes_pkg.sv
index ea275b5..56f2129 100644
--- a/hw/ip/aes/rtl/aes_pkg.sv
+++ b/hw/ip/aes/rtl/aes_pkg.sv
@@ -118,4 +118,19 @@
   end
 endfunction
 
+// Matrix-vector multiplication in GF(2^8): c = A * b
+function automatic logic [7:0] aes_mvm(
+  input logic [7:0] vec_b,
+  input logic [7:0] mat_a [8]
+);
+  logic [7:0] acc;
+  acc = '0;
+  // Column-wise accumulation: entry mat_a[col] is XORed into the result whenever
+  // the input bit vec_b[7-col] is set, i.e., mat_a[0] pairs with the MSB of vec_b.
+  for (int col = 0; col < 8; col++) begin
+    acc = acc ^ (mat_a[col] & {8{vec_b[7-col]}});
+  end
+  return acc;
+endfunction
+
 endpackage
diff --git a/hw/ip/aes/rtl/aes_sbox.sv b/hw/ip/aes/rtl/aes_sbox.sv
new file mode 100644
index 0000000..04266aa
--- /dev/null
+++ b/hw/ip/aes/rtl/aes_sbox.sv
@@ -0,0 +1,37 @@
+// Copyright lowRISC contributors.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// AES SBox
+//
+// Selects the S-Box implementation at compile time through the SBoxImpl parameter:
+// - "lut":      instantiates aes_sbox_lut
+// - "canright": instantiates aes_sbox_canright
+// Any other value aborts elaboration.
+
+module aes_sbox #(
+  parameter SBoxImpl = "lut"
+) (
+  input  aes_pkg::mode_e mode_i,
+  input  logic [7:0]     data_i,
+  output logic [7:0]     data_o
+);
+
+  if (SBoxImpl == "lut") begin : gen_sbox_lut
+    aes_sbox_lut aes_sbox (
+      .mode_i,
+      .data_i,
+      .data_o
+    );
+  end else if (SBoxImpl == "canright") begin : gen_sbox_canright
+    aes_sbox_canright aes_sbox (
+      .mode_i,
+      .data_i,
+      .data_o
+    );
+  end else begin : gen_sbox_unsupported
+    // Without this branch a misspelled SBoxImpl would leave data_o undriven without any error.
+    $fatal(1, "Unsupported SBoxImpl \"%s\": use \"lut\" or \"canright\".", SBoxImpl);
+  end
+
+endmodule
diff --git a/hw/ip/aes/rtl/aes_sbox_canright.sv b/hw/ip/aes/rtl/aes_sbox_canright.sv
new file mode 100644
index 0000000..0995532
--- /dev/null
+++ b/hw/ip/aes/rtl/aes_sbox_canright.sv
@@ -0,0 +1,144 @@
+// Copyright lowRISC contributors.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// AES Canright SBox #4
+//
+// For details, see the technical report: Canright, "A very compact Rijndael S-box"
+// available at https://hdl.handle.net/10945/25608
+
+module aes_sbox_canright (
+  input  aes_pkg::mode_e mode_i,
+  input  logic [7:0]     data_i,
+  output logic [7:0]     data_o
+);
+
+  import aes_pkg::*;
+
+  ///////////////////////////
+  // Functions & Constants //
+  ///////////////////////////
+
+  // Multiplication in GF(2^2), using normal basis [Omega^2, Omega]
+  // (see Figure 14 in the technical report)
+  function automatic logic [1:0] aes_mul_gf2p2(input logic [1:0] g, input logic [1:0] d);
+    logic hi_prod, lo_prod, sum_prod;
+    // Products of the matching coordinates of both operands.
+    hi_prod  = g[1] & d[1];
+    lo_prod  = g[0] & d[0];
+    // Product of the two coordinate sums, shared by both result bits.
+    sum_prod = (g[1] ^ g[0]) & (d[1] ^ d[0]);
+    // Result bit 1 is hi_prod ^ sum_prod, result bit 0 is lo_prod ^ sum_prod.
+    return {hi_prod ^ sum_prod, lo_prod ^ sum_prod};
+  endfunction
+
+  // Scale by Omega^2 = N in GF(2^2), using normal basis [Omega^2, Omega]
+  // (see Figure 16 in the technical report)
+  function automatic logic [1:0] aes_scale_omega2_gf2p2(input logic [1:0] g);
+    // One XOR plus rewiring:
+    //   result[1] = g[0]
+    //   result[0] = g[1] ^ g[0]
+    return {g[0], g[1] ^ g[0]};
+  endfunction
+
+  // Scale by Omega = N^2 in GF(2^2), using normal basis [Omega^2, Omega]
+  // (see Figure 15 in the technical report)
+  function automatic logic [1:0] aes_scale_omega_gf2p2(input logic [1:0] g);
+    // One XOR plus rewiring:
+    //   result[1] = g[1] ^ g[0]
+    //   result[0] = g[1]
+    return {g[1] ^ g[0], g[1]};
+  endfunction
+
+  // Square in GF(2^2), using normal basis [Omega^2, Omega]
+  // (see Figures 8 and 10 in the technical report)
+  function automatic logic [1:0] aes_square_gf2p2(input logic [1:0] g);
+    // Pure rewiring, the two input bits swap places:
+    //   result[1] = g[0]
+    //   result[0] = g[1]
+    return {g[0], g[1]};
+  endfunction
+
+  // Multiplication in GF(2^4), using normal basis [alpha^8, alpha^2]
+  // (see Figure 13 in the technical report)
+  function automatic logic [3:0] aes_mul_gf2p4(input logic [3:0] gamma, input logic [3:0] delta);
+    logic [1:0] hi_prod, lo_prod, sum_scaled;
+    // GF(2^2) products of the matching halves of both operands.
+    hi_prod    = aes_mul_gf2p2(gamma[3:2], delta[3:2]);
+    lo_prod    = aes_mul_gf2p2(gamma[1:0], delta[1:0]);
+    // Product of the half sums, scaled by Omega^2; shared by both result halves.
+    sum_scaled = aes_scale_omega2_gf2p2(
+        aes_mul_gf2p2(gamma[3:2] ^ gamma[1:0], delta[3:2] ^ delta[1:0]));
+    return {hi_prod ^ sum_scaled, lo_prod ^ sum_scaled};
+  endfunction
+
+  // Square and scale by nu in GF(2^4)/GF(2^2), using normal basis [alpha^8, alpha^2]
+  // (see Figure 19 as well as Appendix A of the technical report)
+  function automatic logic [3:0] aes_square_scale_gf2p4_gf2p2(input logic [3:0] gamma);
+    logic [1:0] hi_lo_sum;
+    hi_lo_sum = gamma[3:2] ^ gamma[1:0];
+    // Result layout:
+    //   upper half = square of the sum of both input halves
+    //   lower half = square of the lower input half, scaled by Omega
+    return {aes_square_gf2p2(hi_lo_sum),
+            aes_scale_omega_gf2p2(aes_square_gf2p2(gamma[1:0]))};
+  endfunction
+
+  // Inverse in GF(2^4), using normal basis [alpha^8, alpha^2]
+  // (see Figure 12 in the technical report)
+  function automatic logic [3:0] aes_inverse_gf2p4(input logic [3:0] gamma);
+    logic [1:0] hi, lo, factor;
+    hi     = gamma[3:2];
+    lo     = gamma[1:0];
+    // Common factor of both result halves: the square of
+    // (Omega^2-scaled square of the half sum) ^ (product of the halves).
+    factor = aes_square_gf2p2(
+        aes_scale_omega2_gf2p2(aes_square_gf2p2(hi ^ lo)) ^ aes_mul_gf2p2(hi, lo));
+    // The halves are crossed: the upper result uses lo, the lower result uses hi.
+    return {aes_mul_gf2p2(factor, lo), aes_mul_gf2p2(factor, hi)};
+  endfunction
+
+  // Inverse in GF(2^8), using normal basis [d^16, d]
+  // (see Figure 11 in the technical report)
+  function automatic logic [7:0] aes_inverse_gf2p8(input logic [7:0] gamma);
+    logic [3:0] hi, lo, factor;
+    hi     = gamma[7:4];
+    lo     = gamma[3:0];
+    // Common factor of both result halves: the GF(2^4) inverse of
+    // (squared and scaled half sum) ^ (product of the halves).
+    factor = aes_inverse_gf2p4(
+        aes_square_scale_gf2p4_gf2p2(hi ^ lo) ^ aes_mul_gf2p4(hi, lo));
+    // The halves are crossed: the upper result uses lo, the lower result uses hi.
+    return {aes_mul_gf2p4(factor, lo), aes_mul_gf2p4(factor, hi)};
+  endfunction
+
+  // Basis conversion matrices to convert between polynomial basis A, normal basis X
+  // and basis S incorporating the bit matrix of the SBox. More specifically,
+  // multiplication by X2S performs the transformation from normal basis X into
+  // polynomial basis A, followed by the affine transformation (substep 2). Likewise,
+  // multiplication by S2X performs the inverse affine transformation followed by the
+  // transformation from polynomial basis A to normal basis X.
+  // (see Appendix A of the technical report)
+  localparam logic [7:0] A2X [8] = '{8'h98, 8'hf3, 8'hf2, 8'h48, 8'h09, 8'h81, 8'ha9, 8'hff};
+  localparam logic [7:0] X2A [8] = '{8'h64, 8'h78, 8'h6e, 8'h8c, 8'h68, 8'h29, 8'hde, 8'h60};
+  localparam logic [7:0] X2S [8] = '{8'h58, 8'h2d, 8'h9e, 8'h0b, 8'hdc, 8'h04, 8'h03, 8'h24};
+  localparam logic [7:0] S2X [8] = '{8'h8c, 8'h79, 8'h05, 8'heb, 8'h12, 8'h04, 8'h51, 8'h53};
+
+  ///////////////////
+  // Canright SBox //
+  ///////////////////
+
+  logic [7:0] data_basis_x, data_inverse;
+
+  // Convert to normal basis X. When not encrypting, 8'h63 is XORed onto the input first.
+  assign data_basis_x = (mode_i == AES_ENC) ? aes_mvm(data_i, A2X) :
+                                              aes_mvm(data_i ^ 8'h63, S2X);
+
+  // Do the inversion in normal basis X.
+  assign data_inverse = aes_inverse_gf2p8(data_basis_x);
+
+  // Convert to basis S or A. When encrypting, 8'h63 is XORed onto the result last.
+  assign data_o       = (mode_i == AES_ENC) ? aes_mvm(data_inverse, X2S) ^ 8'h63 :
+                                              aes_mvm(data_inverse, X2A);
+
+endmodule
diff --git a/hw/ip/aes/rtl/aes_sub_bytes.sv b/hw/ip/aes/rtl/aes_sub_bytes.sv
index fec91aa..9890e3b 100644
--- a/hw/ip/aes/rtl/aes_sub_bytes.sv
+++ b/hw/ip/aes/rtl/aes_sub_bytes.sv
@@ -4,7 +4,9 @@
 //
 // AES SubBytes
 
-module aes_sub_bytes (
+module aes_sub_bytes #(
+  parameter SBoxImpl = "lut"
+) (
   input  aes_pkg::mode_e       mode_i,
   input  logic [3:0][3:0][7:0] data_i,
   output logic [3:0][3:0][7:0] data_o
@@ -13,7 +15,9 @@
   // Individually substitute bytes
   for (genvar j = 0; j < 4; j++) begin : gen_sbox_j
     for (genvar i = 0; i < 4; i++) begin : gen_sbox_i
-      aes_sbox_lut aes_sbox_ij (
+      aes_sbox #(
+        .SBoxImpl ( SBoxImpl )
+      ) aes_sbox_ij (
         .mode_i ( mode_i       ),
         .data_i ( data_i[i][j] ),
         .data_o ( data_o[i][j] )