hw/ip/prim/rtl/prim_gf_mult.sv - 3p/lowrisc/opentitan - Git at Google

 // Copyright lowRISC contributors.
 // Licensed under the Apache License, Version 2.0, see LICENSE for details.
 // SPDX-License-Identifier: Apache-2.0
 //
 // This module performs a the multiplication of two operands in Galois field GF(2^Width) modulo the
 // provided irreducible polynomial using a parallel Mastrovito multipler [3]. To cut long paths
 // potentially occurring for large data widths, the implementation provides a parameter
 // StagesPerCycle to decompose the multiplication into Width/StagesPerCycle iterative steps
 // (Digit-Serial/Parallel Multiplier [4]).
 //
 // Note that this module is not pipelined and produces an output sample every Width/StagesPerCycle
 // cycles.
 //
 // References:
 //
 // [1] Patel, "Parallel Multiplier Designs for the Galois/Counter Mode of Operation",
 // https://pdfs.semanticscholar.org/1246/a9ad98dc0421ccfc945e6529c886f23e848d.pdf
 // [2] Wagner, "The Laws of Cryptography: The Finite Field GF(2^8)",
 // http://www.cs.utsa.edu/~wagner/laws/FFM.html
 //
 // [3]: Mastrovito, "VLSI Designs for Multiplication over Finite Fields GF(2^m)",
 // https://link.springer.com/chapter/10.1007/3-540-51083-4_67
 // [4]: Song et al., "Efficient Finite Field Serial/Parallel Multiplication",
 // https://ieeexplore.ieee.org/document/542803


 `include "prim_assert.sv"

 module prim_gf_mult #(
   parameter int Width = 32,
   parameter int StagesPerCycle = Width,

   // The field-generating, irreducible polynomial of degree Width.
   // Can for example be a Conway polynomial, see
   // http://www.math.rwth-aachen.de/~Frank.Luebeck/data/ConwayPol/CP2.html
   // For Width = 33, the Conway polynomial has bits 32, 15, 9, 7, 4, 3, 0 set to one.
   parameter logic[Width-1:0] IPoly = Width'(1'b1) << 15 |
                                      Width'(1'b1) << 9  |
                                      Width'(1'b1) << 7  |
                                      Width'(1'b1) << 4  |
                                      Width'(1'b1) << 3  |
                                      Width'(1'b1) << 0
 ) (
   input clk_i,
   input rst_ni,
   input req_i,
   input [Width-1:0] operand_a_i,
   input [Width-1:0] operand_b_i,
   output logic ack_o,
   output logic [Width-1:0] prod_o
 );

   `ASSERT_INIT(IntegerLoops_A, (Width % StagesPerCycle) == 0)
   `ASSERT_INIT(StagePow2_A, $onehot(StagesPerCycle))

   localparam int Loops = Width / StagesPerCycle;
   localparam int CntWidth = (Loops == 1) ? 1 : $clog2(Loops);

   // reformat operand_b_i
   logic [Loops-1:0][StagesPerCycle-1:0] reformat_data;

   // this slice of operand bits used during each loop
   logic [StagesPerCycle-1:0] op_i_slice;

   // the matrix is made up of a series of GF(2^Width) * x
   logic [StagesPerCycle-1:0][Width-1:0] matrix;

   // since the matrix generation is not done in one go, we must remember
   // where it last left off
   logic [Width-1:0] vector;

   // this variable tracks which loop we are currently operating
   logic [CntWidth-1:0] cnt;

   // this variable tracks the first loop through the multiply
   logic first;

   // intermediate prod held between loops
   logic [Width-1:0] prod_q, prod_d;

   // select current slice
   assign reformat_data = operand_b_i;
   assign op_i_slice = reformat_data[cnt];

   assign first = cnt == 0;

   if (StagesPerCycle == Width) begin : gen_all_combo

     assign ack_o = 1'b1;
     assign cnt = '0;
     assign prod_q = '0;
     assign vector = '0;

   end else begin : gen_decomposed

     // multiply is done
     assign ack_o = int'(cnt) == (Loops - 1);

     // advance the stage count and also advance the bit position count
     always_ff @(posedge clk_i or negedge rst_ni) begin
       if (!rst_ni) begin
         cnt <= '0;
       end else if (req_i && ack_o) begin
         cnt <= '0;
       end else if (req_i && int'(cnt) < (Loops - 1)) begin
         cnt <= cnt + 1'b1;
       end
     end

     always_ff @(posedge clk_i or negedge rst_ni) begin
       if (!rst_ni) begin
         prod_q <= '0;
         vector <= '0;
       end else if (ack_o) begin
         prod_q <= '0;
         vector <= '0;
       end else if (req_i) begin
         prod_q <= prod_d;
         vector <= matrix[StagesPerCycle-1];
       end
     end
   end


   assign matrix = first ? gen_matrix(operand_a_i, 1'b1) : gen_matrix(vector, 1'b0);
   assign prod_d = prod_q ^ gf_mult(matrix, op_i_slice);

   // The output is not toggled until it is ready
   assign prod_o = ack_o ? prod_d : operand_a_i;


   // GF(2^Width) * x
   function automatic logic [Width-1:0] gf_mult2(
     logic [Width-1:0] operand
   );
     logic [Width-1:0] mult_out;
     mult_out = operand[Width-1] ? (operand << 1) ^ IPoly : (operand << 1);
     return mult_out;
   endfunction

   // Matrix generate step
   function automatic logic [StagesPerCycle-1:0][Width-1:0] gen_matrix(
     logic [Width-1:0] seed,
     logic init
   );
     logic [StagesPerCycle-1:0][Width-1:0] matrix_out;

     matrix_out[0] = init ? seed : gf_mult2(seed);
     matrix_out[StagesPerCycle-1:1] = '0;
     for (int i = 1; i < StagesPerCycle; i++) begin
       matrix_out[i] = gf_mult2(matrix_out[i-1]);
     end
     return matrix_out;
   endfunction

   // Galois multiply step
   function automatic logic [Width-1:0] gf_mult(
     logic [StagesPerCycle-1:0][Width-1:0] matrix_,
     logic [StagesPerCycle-1:0] operand
   );
     logic [Width-1:0] mult_out;
     logic [Width-1:0] add_vector;
     mult_out = '0;
     for (int i = 0; i < StagesPerCycle; i++) begin
       add_vector = operand[i] ? matrix_[i] : '0;
       mult_out = mult_out ^ add_vector;
     end
     return mult_out;
   endfunction // gf_mult

 endmodule // prim_gf_mult
	// Copyright lowRISC contributors.
	// Licensed under the Apache License, Version 2.0, see LICENSE for details.
	// SPDX-License-Identifier: Apache-2.0
	//
	// This module performs a the multiplication of two operands in Galois field GF(2^Width) modulo the
	// provided irreducible polynomial using a parallel Mastrovito multipler [3]. To cut long paths
	// potentially occurring for large data widths, the implementation provides a parameter
	// StagesPerCycle to decompose the multiplication into Width/StagesPerCycle iterative steps
	// (Digit-Serial/Parallel Multiplier [4]).
	//
	// Note that this module is not pipelined and produces an output sample every Width/StagesPerCycle
	// cycles.
	//
	// References:
	//
	// [1] Patel, "Parallel Multiplier Designs for the Galois/Counter Mode of Operation",
	// https://pdfs.semanticscholar.org/1246/a9ad98dc0421ccfc945e6529c886f23e848d.pdf
	// [2] Wagner, "The Laws of Cryptography: The Finite Field GF(2^8)",
	// http://www.cs.utsa.edu/~wagner/laws/FFM.html
	//
	// [3]: Mastrovito, "VLSI Designs for Multiplication over Finite Fields GF(2^m)",
	// https://link.springer.com/chapter/10.1007/3-540-51083-4_67
	// [4]: Song et al., "Efficient Finite Field Serial/Parallel Multiplication",
	// https://ieeexplore.ieee.org/document/542803


	`include "prim_assert.sv"

	module prim_gf_mult #(
	parameter int Width = 32,
	parameter int StagesPerCycle = Width,

	// The field-generating, irreducible polynomial of degree Width.
	// Can for example be a Conway polynomial, see
	// http://www.math.rwth-aachen.de/~Frank.Luebeck/data/ConwayPol/CP2.html
	// For Width = 33, the Conway polynomial has bits 32, 15, 9, 7, 4, 3, 0 set to one.
	parameter logic[Width-1:0] IPoly = Width'(1'b1) << 15 \|
	Width'(1'b1) << 9 \|
	Width'(1'b1) << 7 \|
	Width'(1'b1) << 4 \|
	Width'(1'b1) << 3 \|
	Width'(1'b1) << 0
	) (
	input clk_i,
	input rst_ni,
	input req_i,
	input [Width-1:0] operand_a_i,
	input [Width-1:0] operand_b_i,
	output logic ack_o,
	output logic [Width-1:0] prod_o
	);

	`ASSERT_INIT(IntegerLoops_A, (Width % StagesPerCycle) == 0)
	`ASSERT_INIT(StagePow2_A, $onehot(StagesPerCycle))

	localparam int Loops = Width / StagesPerCycle;
	localparam int CntWidth = (Loops == 1) ? 1 : $clog2(Loops);

	// reformat operand_b_i
	logic [Loops-1:0][StagesPerCycle-1:0] reformat_data;

	// this slice of operand bits used during each loop
	logic [StagesPerCycle-1:0] op_i_slice;

	// the matrix is made up of a series of GF(2^Width) * x
	logic [StagesPerCycle-1:0][Width-1:0] matrix;

	// since the matrix generation is not done in one go, we must remember
	// where it last left off
	logic [Width-1:0] vector;

	// this variable tracks which loop we are currently operating
	logic [CntWidth-1:0] cnt;

	// this variable tracks the first loop through the multiply
	logic first;

	// intermediate prod held between loops
	logic [Width-1:0] prod_q, prod_d;

	// select current slice
	assign reformat_data = operand_b_i;
	assign op_i_slice = reformat_data[cnt];

	assign first = cnt == 0;

	if (StagesPerCycle == Width) begin : gen_all_combo

	assign ack_o = 1'b1;
	assign cnt = '0;
	assign prod_q = '0;
	assign vector = '0;

	end else begin : gen_decomposed

	// multiply is done
	assign ack_o = int'(cnt) == (Loops - 1);

	// advance the stage count and also advance the bit position count
	always_ff @(posedge clk_i or negedge rst_ni) begin
	if (!rst_ni) begin
	cnt <= '0;
	end else if (req_i && ack_o) begin
	cnt <= '0;
	end else if (req_i && int'(cnt) < (Loops - 1)) begin
	cnt <= cnt + 1'b1;
	end
	end

	always_ff @(posedge clk_i or negedge rst_ni) begin
	if (!rst_ni) begin
	prod_q <= '0;
	vector <= '0;
	end else if (ack_o) begin
	prod_q <= '0;
	vector <= '0;
	end else if (req_i) begin
	prod_q <= prod_d;
	vector <= matrix[StagesPerCycle-1];
	end
	end
	end


	assign matrix = first ? gen_matrix(operand_a_i, 1'b1) : gen_matrix(vector, 1'b0);
	assign prod_d = prod_q ^ gf_mult(matrix, op_i_slice);

	// The output is not toggled until it is ready
	assign prod_o = ack_o ? prod_d : operand_a_i;


	// GF(2^Width) * x
	function automatic logic [Width-1:0] gf_mult2(
	logic [Width-1:0] operand
	);
	logic [Width-1:0] mult_out;
	mult_out = operand[Width-1] ? (operand << 1) ^ IPoly : (operand << 1);
	return mult_out;
	endfunction

	// Matrix generate step
	function automatic logic [StagesPerCycle-1:0][Width-1:0] gen_matrix(
	logic [Width-1:0] seed,
	logic init
	);
	logic [StagesPerCycle-1:0][Width-1:0] matrix_out;

	matrix_out[0] = init ? seed : gf_mult2(seed);
	matrix_out[StagesPerCycle-1:1] = '0;
	for (int i = 1; i < StagesPerCycle; i++) begin
	matrix_out[i] = gf_mult2(matrix_out[i-1]);
	end
	return matrix_out;
	endfunction

	// Galois multiply step
	function automatic logic [Width-1:0] gf_mult(
	logic [StagesPerCycle-1:0][Width-1:0] matrix_,
	logic [StagesPerCycle-1:0] operand
	);
	logic [Width-1:0] mult_out;
	logic [Width-1:0] add_vector;
	mult_out = '0;
	for (int i = 0; i < StagesPerCycle; i++) begin
	add_vector = operand[i] ? matrix_[i] : '0;
	mult_out = mult_out ^ add_vector;
	end
	return mult_out;
	endfunction // gf_mult

	endmodule // prim_gf_mult