[flash] Add scrambling primitive and hook-up to program / read pipelines

Signed-off-by: Timothy Chen <timothytim@google.com>

[flash] Pick correct buffer to update after de-scramble

Signed-off-by: Timothy Chen <timothytim@google.com>

[flash] update to multi-cycle prince and various fixes

Signed-off-by: Timothy Chen <timothytim@google.com>
diff --git a/hw/ip/flash_ctrl/data/flash_ctrl.hjson b/hw/ip/flash_ctrl/data/flash_ctrl.hjson
index 1edaaee..cf426c9 100644
--- a/hw/ip/flash_ctrl/data/flash_ctrl.hjson
+++ b/hw/ip/flash_ctrl/data/flash_ctrl.hjson
@@ -20,7 +20,15 @@
       name:    "flash",          // flash_o (req), flash_i (rsp)
       act:     "req",
       package: "flash_ctrl_pkg", // Origin package (only needs for the requester)
+    },
+
+    { struct: "otp_flash",
+      type: "uni",
+      name: "otp",
+      act:  "rcv",
+      package: "flash_ctrl_pkg"
     }
+
   ],
 
   param_list: [
@@ -187,6 +195,23 @@
       ]
     },
 
+    { name: "SCRAMBLE_EN",
+      desc: "Scramble enable for flash",
+      swaccess: "rw",
+      hwaccess: "hro",
+      resval: "0",
+      fields: [
+        { bits: "0",
+          name: "VAL",
+          desc: '''
+            Temporary enable bit for flash scramble.
+            See #2630.
+            '''
+          resval: "0"
+        },
+      ]
+    },
+
 // TODO(#1412):
 // This multireg is temporarily removed until the nested multireg compact feature is fully implemented.
 // Until then, use only one register wen for all flash regions.
diff --git a/hw/ip/flash_ctrl/flash_ctrl.core b/hw/ip/flash_ctrl/flash_ctrl.core
index bc04143..01519b3 100644
--- a/hw/ip/flash_ctrl/flash_ctrl.core
+++ b/hw/ip/flash_ctrl/flash_ctrl.core
@@ -3,7 +3,7 @@
 # Licensed under the Apache License, Version 2.0, see LICENSE for details.
 # SPDX-License-Identifier: Apache-2.0
 name: "lowrisc:ip:flash_ctrl:0.1"
-description: "Faux Flash Controller"
+description: "Flash Controller"
 
 filesets:
   files_rtl:
@@ -11,6 +11,7 @@
       - lowrisc:ip:tlul
       - lowrisc:prim:all
       - lowrisc:prim:flash
+      - lowrisc:prim:gf_mult
       - lowrisc:ip:flash_ctrl_pkg
     files:
       - rtl/flash_ctrl_reg_pkg.sv
@@ -25,6 +26,7 @@
       - rtl/flash_phy_rd.sv
       - rtl/flash_phy_prog.sv
       - rtl/flash_phy_rd_buffers.sv
+      - rtl/flash_phy_scramble.sv
     file_type: systemVerilogSource
 
   files_verilator_waiver:
diff --git a/hw/ip/flash_ctrl/rtl/flash_ctrl.sv b/hw/ip/flash_ctrl/rtl/flash_ctrl.sv
index f8bae22..c1a5591 100644
--- a/hw/ip/flash_ctrl/rtl/flash_ctrl.sv
+++ b/hw/ip/flash_ctrl/rtl/flash_ctrl.sv
@@ -20,6 +20,9 @@
   input        flash_rsp_t flash_i,
   output       flash_req_t flash_o,
 
+  // OTP Interface
+  input        otp_flash_t otp_i,
+
   // Interrupts
   output logic intr_prog_empty_o, // Program fifo is empty
   output logic intr_prog_lvl_o,   // Program fifo is empty
@@ -388,6 +391,9 @@
   assign flash_o.part = flash_part_sel;
   assign flash_o.prog_data = flash_prog_data;
   assign flash_o.prog_last = flash_prog_last;
+  assign flash_o.scramble_en = reg2hw.scramble_en.q;
+  assign flash_o.addr_key = otp_i.addr_key;
+  assign flash_o.data_key = otp_i.data_key;
   assign flash_rd_data = flash_i.rd_data;
   assign init_busy = flash_i.init_busy;
 
diff --git a/hw/ip/flash_ctrl/rtl/flash_ctrl_pkg.sv b/hw/ip/flash_ctrl/rtl/flash_ctrl_pkg.sv
index 52be4e8..f8351f6 100644
--- a/hw/ip/flash_ctrl/rtl/flash_ctrl_pkg.sv
+++ b/hw/ip/flash_ctrl/rtl/flash_ctrl_pkg.sv
@@ -77,6 +77,9 @@
     logic [BusAddrW-1:0]  addr;
     logic [BusWidth-1:0]  prog_data;
     logic                 prog_last;
+    logic                 scramble_en;
+    logic [127:0]         addr_key;
+    logic [127:0]         data_key;
   } flash_req_t;
 
   // default value of flash_req_t (for dangling ports)
@@ -89,7 +92,10 @@
     part:      DataPart,
     addr:      '0,
     prog_data: '0,
-    prog_last: '0
+    prog_last: '0,
+    scramble_en: '0,
+    addr_key:  128'hDEADBEEFBEEFFACEDEADBEEF5A5AA5A5,
+    data_key:  128'hDEADBEEF5A5AA5A5DEADBEEFBEEFFACE
   };
 
   // memory to flash controller
@@ -110,4 +116,22 @@
     init_busy:  1'b0
   };
 
+  ////////////////////////////
+  // The following inter-module definitions should be moved to OTP
+  ////////////////////////////
+
+  // otp to flash_phy
+  typedef struct packed {
+    logic [127:0] addr_key;  // forwarded by flash_ctrl onto flash_o.addr_key
+    logic [127:0] data_key;  // forwarded by flash_ctrl onto flash_o.data_key
+  } otp_flash_t;
+
+  // default value of otp_flash_t
+  parameter otp_flash_t OTP_FLASH_DEFAULT = '{
+    addr_key: 128'hDEADBEEFBEEFFACEDEADBEEF5A5AA5A5,
+    data_key: 128'hDEADBEEF5A5AA5A5DEADBEEFBEEFFACE
+  };
+
+
+
 endpackage : flash_ctrl_pkg
diff --git a/hw/ip/flash_ctrl/rtl/flash_phy.sv b/hw/ip/flash_ctrl/rtl/flash_phy.sv
index 87f05b7..66b70a5 100644
--- a/hw/ip/flash_ctrl/rtl/flash_phy.sv
+++ b/hw/ip/flash_ctrl/rtl/flash_phy.sv
@@ -118,13 +118,27 @@
       .rdata  (host_rsp_data[bank])
     );
 
+    logic host_req;
+    logic ctrl_req;
+    logic host_scramble_en;
+    logic ctrl_scramble_en;
+
+    assign host_req = host_req_i & (host_bank_sel == bank) & host_rsp_avail[bank];
+    assign ctrl_req = flash_ctrl_i.req & (ctrl_bank_sel == bank);
+
+    // #2630: Temporary scramble enable logic on one of the banks until register configuration
+    // is setup.
+    assign host_scramble_en = host_req & host_addr_i[BusAddrW-1 -: BankW] == 1;
+    assign ctrl_scramble_en = ctrl_req & flash_ctrl_i.addr[BusAddrW-1 -: BankW] == 1;
+
     flash_phy_core i_core (
       .clk_i,
       .rst_ni,
-      .req_i(flash_ctrl_i.req & (ctrl_bank_sel == bank)),
+      .scramble_en_i(flash_ctrl_i.scramble_en & (host_scramble_en | ctrl_scramble_en)),
+      .req_i(ctrl_req),
       // host request must be suppressed if response fifo cannot hold more
       // otherwise the flash_phy_core and flash_phy will get out of sync
-      .host_req_i(host_req_i & (host_bank_sel == bank) & host_rsp_avail[bank]),
+      .host_req_i(host_req),
       .host_addr_i(host_addr_i[0 +: BusBankAddrW]),
       .rd_i(flash_ctrl_i.rd),
       .prog_i(flash_ctrl_i.prog),
@@ -134,6 +148,8 @@
       .addr_i(flash_ctrl_i.addr[0 +: BusBankAddrW]),
       .prog_data_i(flash_ctrl_i.prog_data),
       .prog_last_i(flash_ctrl_i.prog_last),
+      .addr_key_i(flash_ctrl_i.addr_key),
+      .data_key_i(flash_ctrl_i.data_key),
       .host_req_rdy_o(host_req_rdy[bank]),
       .host_req_done_o(host_req_done[bank]),
       .rd_done_o(rd_done[bank]),
diff --git a/hw/ip/flash_ctrl/rtl/flash_phy_core.sv b/hw/ip/flash_ctrl/rtl/flash_phy_core.sv
index a04d48b..ddc0518 100644
--- a/hw/ip/flash_ctrl/rtl/flash_phy_core.sv
+++ b/hw/ip/flash_ctrl/rtl/flash_phy_core.sv
@@ -15,9 +15,10 @@
 ) (
   input                              clk_i,
   input                              rst_ni,
-  input                              host_req_i, // host request - read only
+  input                              scramble_en_i,// temporary signal
+  input                              host_req_i,   // host request - read only
   input [BusBankAddrW-1:0]           host_addr_i,
-  input                              req_i,      // controller request
+  input                              req_i,        // controller request
   input                              rd_i,
   input                              prog_i,
   input                              pg_erase_i,
@@ -26,6 +27,8 @@
   input [BusBankAddrW-1:0]           addr_i,
   input [BusWidth-1:0]               prog_data_i,
   input                              prog_last_i,
+  input [KeySize-1:0]                addr_key_i,
+  input [KeySize-1:0]                data_key_i,
   output logic                       host_req_rdy_o,
   output logic                       host_req_done_o,
   output logic                       rd_done_o,
@@ -85,6 +88,11 @@
   logic inc_arb_cnt, clr_arb_cnt;
   logic host_req_masked;
 
+  // scramble / de-scramble connections
+  logic calc_ack;
+  logic op_ack;
+  logic [DataWidth-1:0] scramble_mask;
+
   assign host_req_masked = host_req_i & (arb_cnt < ArbCnt);
 
   always_ff @(posedge clk_i or negedge rst_ni) begin
@@ -211,11 +219,17 @@
 
   logic flash_rd_req;
   logic [DataWidth-1:0] flash_rdata;
+  logic rd_calc_req;
+  logic [BankAddrW-1:0] rd_calc_addr;
+  logic rd_op_req;
+  logic [DataWidth-1:0] rd_scrambled_data;
+  logic [DataWidth-1:0] rd_descrambled_data;
 
   flash_phy_rd u_rd (
     .clk_i,
     .rst_ni,
     .req_i(reqs[PhyRead]),
+    .descramble_i(scramble_en_i),
     .prog_i(reqs[PhyProg]),
     .pg_erase_i(reqs[PhyPgErase]),
     .bk_erase_i(reqs[PhyBkErase]),
@@ -227,16 +241,26 @@
     .idle_o(rd_stage_idle),
     .req_o(flash_rd_req),
     .ack_i(ack),
-    .data_i(flash_rdata)
+    .data_i(flash_rdata),
+    //scramble unit interface
+    .calc_req_o(rd_calc_req),
+    .calc_addr_o(rd_calc_addr),
+    .descramble_req_o(rd_op_req),
+    .scrambled_data_o(rd_scrambled_data),
+    .calc_ack_i(calc_ack),
+    .descramble_ack_i(op_ack),
+    .mask_i(scramble_mask),
+    .descrambled_data_i(rd_descrambled_data)
     );
 
   ////////////////////////
   // program pipeline
   ////////////////////////
 
-  // Below code is temporary and does not account for scrambling
-  logic [DataWidth-1:0] prog_data;
+  logic [DataWidth-1:0] prog_data, prog_scrambled_data;
   logic flash_prog_req;
+  logic prog_calc_req;
+  logic prog_op_req;
 
   if (WidthMultiple == 1) begin : gen_single_prog_data
     assign flash_prog_req = reqs[PhyProg];
@@ -247,10 +271,17 @@
       .clk_i,
       .rst_ni,
       .req_i(reqs[PhyProg]),
+      .scramble_i(scramble_en_i),
       .sel_i(addr_i[0 +: WordSelW]),
       .data_i(prog_data_i),
       .last_i(prog_last_i),
       .ack_i(ack),
+      .calc_ack_i(calc_ack),
+      .scramble_ack_i(op_ack),
+      .mask_i(scramble_mask),
+      .scrambled_data_i(prog_scrambled_data),
+      .calc_req_o(prog_calc_req),
+      .scramble_req_o(prog_op_req),
       .req_o(flash_prog_req),
       .ack_o(prog_ack),
       .data_o(prog_data)
@@ -262,6 +293,28 @@
   // scrambling / de-scrambling primitive
   ////////////////////////
 
+  logic [BankAddrW-1:0] scramble_muxed_addr;
+  assign scramble_muxed_addr = prog_calc_req ? muxed_addr[BusBankAddrW-1:LsbAddrBit] :
+                                               rd_calc_addr;
+
+  flash_phy_scramble u_scramble (
+    .clk_i,
+    .rst_ni,
+    .calc_req_i(prog_calc_req | rd_calc_req),
+    .op_req_i(prog_op_req | rd_op_req),
+    .op_type_i(prog_op_req ? ScrambleOp : DeScrambleOp),
+    .addr_i(scramble_muxed_addr),
+    .plain_data_i(prog_data),
+    .scrambled_data_i(rd_scrambled_data),
+    .addr_key_i(addr_key_i),
+    .data_key_i(data_key_i),
+    .calc_ack_o(calc_ack),
+    .op_ack_o(op_ack),
+    .mask_o(scramble_mask),
+    .plain_data_o(rd_descrambled_data),
+    .scrambled_data_o(prog_scrambled_data)
+  );
+
 
   ////////////////////////
   // Actual connection to flash phy
diff --git a/hw/ip/flash_ctrl/rtl/flash_phy_pkg.sv b/hw/ip/flash_ctrl/rtl/flash_phy_pkg.sv
index 1ed95ce..6da6b6e 100644
--- a/hw/ip/flash_ctrl/rtl/flash_phy_pkg.sv
+++ b/hw/ip/flash_ctrl/rtl/flash_phy_pkg.sv
@@ -35,6 +35,13 @@
   parameter int LsbAddrBit    = $clog2(WidthMultiple);
   parameter int WordSelW      = WidthMultiple == 1 ? 1 : LsbAddrBit;
 
+  // scramble / de-scramble parameters
+  // KeySize is the key width in bits; GfMultCycles is the number of cycles given to gf_mult
+  parameter int KeySize       = 128;
+  parameter int GfMultCycles  = 2;
+  // If this value is greater than 1, constraints must be updated for multicycle paths
+  parameter int CipherCycles  = 2;
+
   // Read buffer metadata
   typedef enum logic [1:0] {
     Invalid     = 2'h0,
@@ -57,6 +64,11 @@
 
   parameter int RspOrderFifoWidth = $bits(rsp_fifo_entry_t);
 
+  typedef struct packed {
+    logic [BankAddrW-1:0] addr;  // flash word address saved when a read is issued
+    logic descramble;            // descramble_i value saved when a read is issued
+  } rd_attr_t;
+
   // Flash Operations Supported
   typedef enum logic [2:0] {
     PhyRead      = 3'h0,
@@ -73,4 +85,9 @@
     Ctrl         = 2'h2
   } flash_phy_op_sel_e;
 
+  typedef enum logic {
+    ScrambleOp   = 1'b0,  // cipher consumes plain data (program path)
+    DeScrambleOp = 1'b1   // cipher runs with dec_i set (read path)
+  } cipher_ops_e;
+
 endpackage // flash_phy_pkg
diff --git a/hw/ip/flash_ctrl/rtl/flash_phy_prog.sv b/hw/ip/flash_ctrl/rtl/flash_phy_prog.sv
index 68d8392..d64cf8f 100644
--- a/hw/ip/flash_ctrl/rtl/flash_phy_prog.sv
+++ b/hw/ip/flash_ctrl/rtl/flash_phy_prog.sv
@@ -26,10 +26,17 @@
   input clk_i,
   input rst_ni,
   input req_i,
+  input scramble_i,
   input [WordSelW-1:0] sel_i,
   input [BusWidth-1:0] data_i,
   input last_i,
   input ack_i,
+  input calc_ack_i,
+  input scramble_ack_i,
+  input [DataWidth-1:0] mask_i,
+  input [DataWidth-1:0] scrambled_data_i,
+  output logic calc_req_o,
+  output logic scramble_req_o,
   output logic req_o,
   output logic ack_o,
   output logic [DataWidth-1:0] data_o
@@ -40,7 +47,9 @@
     StPrePack,
     StPackData,
     StPostPack,
-    StWaitFlash
+    StWaitFlash,
+    StCalcMask,
+    StScrambleData
   } prog_state_e;
 
   typedef enum logic [1:0] {
@@ -99,6 +108,8 @@
     data_sel = Filler;
     req_o = 1'b0;
     ack_o = 1'b0;
+    calc_req_o = 1'b0;
+    scramble_req_o = 1'b0;
 
     unique case (state_q)
       StIdle: begin
@@ -124,7 +135,7 @@
 
         if (req_i && idx == (WidthMultiple-1)) begin
           // last beat of a flash word
-          state_d = StWaitFlash;
+          state_d = scramble_i ? StCalcMask : StWaitFlash;
         end else if (req_i && last_i) begin
           // last beat is not aligned with the last entry of flash word
           state_d = StPostPack;
@@ -140,6 +151,22 @@
 
         // finish packing remaining entries
         if (idx == (WidthMultiple-1)) begin
+          state_d = scramble_i ? StCalcMask : StWaitFlash;
+        end
+      end
+
+      StCalcMask: begin
+        calc_req_o = 1'b1;
+
+        if (calc_ack_i) begin
+          state_d = StScrambleData;
+        end
+      end
+
+      StScrambleData: begin
+        scramble_req_o = 1'b1;
+
+        if (scramble_ack_i) begin
           state_d = StWaitFlash;
         end
       end
@@ -157,16 +184,27 @@
     endcase // unique case (state_q)
   end
 
+  logic [DataWidth-1:0] mask_q;
+
   always_ff @(posedge clk_i or negedge rst_ni) begin
     if (!rst_ni) begin
       packed_data <= '0;
+      mask_q <= '0;
     end else if (req_o && ack_i) begin
       packed_data <= '0;
+    end else if (calc_req_o && calc_ack_i) begin
+      packed_data <= packed_data ^ mask_i;
+      mask_q <= mask_i;
+    end else if (scramble_req_o && scramble_ack_i) begin
+      packed_data <= scrambled_data_i ^ mask_q;
     end else if (pack_valid) begin
       packed_data[idx] <= pack_data;
     end
   end
 
+
+
+
   assign data_o = packed_data;
 
 
diff --git a/hw/ip/flash_ctrl/rtl/flash_phy_rd.sv b/hw/ip/flash_ctrl/rtl/flash_phy_rd.sv
index 1054e4b..4ebee35 100644
--- a/hw/ip/flash_ctrl/rtl/flash_phy_rd.sv
+++ b/hw/ip/flash_ctrl/rtl/flash_phy_rd.sv
@@ -13,18 +13,18 @@
 // upstream to stop issuing instructions, however once issued, the upstream will
 // always accept the response.
 //
-// TBD: Add support for descramble stage
-// The allocate and descramble indication received at read stage must be saved.
+// Support for descramble stage
+// The allocate and descramble indication received at read stage are saved.
 // When the read completes, depending on the 'descramble' indication saved, the
 // data is either stored into FIFO (reg + skid) between read and descramble stage,
 // or forwarded directly to the buffers (no de-scramble)
 //
-// If the storage element between read and de-scramble stages are completely fully
-// for some reason, then the read stage cannot start
+// If the storage element between read and de-scramble stages are completely full
+// for any reason, then the read stage cannot start.
 //
-// When the read stage begins, the galois multiply portion of the de-scramble should
+// When the read stage begins, the galois multiply portion of the de-scramble will
 // also be kicked off. When the galois multiply stage AND read stage completes, the
-// de-scramble is also kicked off (which is really what the de-scramble stage is doing).
+// de-scramble is then kicked off.
 
 module flash_phy_rd import flash_phy_pkg::*; (
   input clk_i,
@@ -32,6 +32,7 @@
 
   // interface with arbitration unit
   input req_i,
+  input descramble_i,
   input prog_i,
   input pg_erase_i,
   input bk_erase_i,
@@ -42,6 +43,16 @@
   output logic [BusWidth-1:0] data_o,
   output logic idle_o, // the entire read pipeline is idle
 
+  // interface with scramble unit
+  output logic calc_req_o,
+  output logic descramble_req_o,
+  output logic [BankAddrW-1:0] calc_addr_o,
+  output logic [DataWidth-1:0] scrambled_data_o,
+  input calc_ack_i,
+  input descramble_ack_i,
+  input [DataWidth-1:0] mask_i,
+  input [DataWidth-1:0] descrambled_data_i,
+
   // interface to actual flash primitive
   output logic req_o,
   input ack_i,
@@ -52,6 +63,12 @@
   // Read buffers
   /////////////////////////////////
 
+  // muxed de-scrambled and plain-data
+  logic [DataWidth-1:0] muxed_data;
+
+  // muxed data valid signal that takes scrambling into consideration
+  logic data_valid;
+
   // A buffer allocate is invoked when a new transaction arrives.
   // Alloc only happens if the new transaction does not match an existing entry.
   logic [NumBuf-1:0] alloc;
@@ -166,7 +183,7 @@
   // update sets state to valid
   // wipe sets state to invalid - this comes from prog
   for (genvar i = 0; i < NumBuf; i++) begin: gen_bufs
-    flash_phy_rd_buffers i_rd_buf (
+    flash_phy_rd_buffers u_rd_buf (
       .clk_i,
       .rst_ni,
       .alloc_i(rdy_o & alloc[i]),
@@ -174,7 +191,7 @@
       .wipe_i(data_hazard[i]),
       .addr_i(flash_word_addr),
       .part_i(part_i),
-      .data_i(data_i),
+      .data_i(muxed_data),
       .out_o(read_buf[i])
     );
   end
@@ -203,7 +220,13 @@
   logic rd_busy;
   logic rd_done;
   logic [NumBuf-1:0] alloc_q;
+  rd_attr_t rd_attrs;
 
+  // scramble stage ready
+  logic scramble_stage_rdy;
+
+  // read done does not mean data is available.
+  // if the data must be de-scrambled, there is another wait stage
   assign rd_done = rd_busy & ack_i;
 
   // if buffer allocated, that is the return source
@@ -239,10 +262,13 @@
     if (!rst_ni) begin
       rd_busy <= 1'b0;
       alloc_q <= '0;
+      rd_attrs <= '0;
     end else if (req_o) begin
       // read only becomes busy if a buffer is allocated and read
       rd_busy <= 1'b1;
       alloc_q <= alloc;
+      rd_attrs.addr <= addr_i[BusBankAddrW-1:LsbAddrBit];
+      rd_attrs.descramble <= descramble_i;
     end else if (rd_done) begin
       rd_busy <= 1'b0;
     end
@@ -254,7 +280,8 @@
 
   // if no buffers matched, accept only if read state is idle and there is space
   // if buffer is matched, accept as long as there is space in the rsp fifo
-  assign rdy_o = no_match ? rd_stage_idle & rsp_fifo_rdy : rsp_fifo_rdy;
+  assign rdy_o = no_match ? rd_stage_idle & rsp_fifo_rdy & scramble_stage_rdy :
+                            rsp_fifo_rdy & scramble_stage_rdy;
 
   // issue a transaction to flash
   assign req_o = req_i & rdy_o & no_match;
@@ -263,7 +290,111 @@
   // De-scrambling stage
   /////////////////////////////////
 
-  // nothing here yet
+  logic fifo_data_ready;
+  logic fifo_data_valid;
+  logic mask_valid;
+  logic [DataWidth-1:0] fifo_data;
+  logic [DataWidth-1:0] mask;
+  logic data_fifo_rdy;
+  logic mask_fifo_rdy;
+  logic forward;
+  logic hint_forward;
+  logic hint_descram;
+  logic [NumBuf-1:0] alloc_q2;
+
+  assign scramble_stage_rdy = data_fifo_rdy & mask_fifo_rdy;
+
+  // data is consumed when:
+  // 1. When descrambling completes
+  // 2. Immediately consumed when descrambling not required
+  // 3. In both cases, when data has not already been forwarded
+  assign fifo_data_ready = hint_descram ? descramble_req_o & descramble_ack_i & ~hint_forward :
+                                          fifo_data_valid & !hint_forward;
+
+  // data is forwarded whenever it does not require descrambling or if it has been erased
+  // but forwarding is only possible if there are no entries in the FIFO to ensure the current
+  // read cannot run ahead of the descramble.
+  assign forward = rd_done & !fifo_data_valid &
+                   ((data_i == {DataWidth{1'b1}}) | !rd_attrs.descramble);
+
+  // storage for read outputs
+  // This storage element can be fully merged with the fifo below if the time it takes
+  // to do a read is matched to gf_mult.  This is doable and should be considered.
+  // However it would create a dependency on constraints (multicycle) instead of
+  // being correct by construction.
+  //
+  // In addition to potential different completion times, the mask storage may also
+  // be pushed even if it is not required (erase case). The solution for this issue
+  // is that the mask / data are always pushed, it is then selectively popped based
+  // on the forward / de-scrambling hints.
+  //
+  // All these problems could be resolved if the timings matched exactly, however
+  // the user would need to correctly setup constraints on either flash / gf_mult
+  // timing change.
+  prim_fifo_sync #(
+    .Width  (DataWidth + 2 + NumBuf),
+    .Pass   (0),
+    .Depth  (2)
+  ) u_rd_storage (
+    .clk_i,
+    .rst_ni,
+    .clr_i  (1'b0),
+    .wvalid (rd_done),
+    .wready (data_fifo_rdy),
+    .wdata  ({alloc_q, rd_attrs.descramble,forward,data_i}),
+    .depth  (),
+    .rvalid (fifo_data_valid),
+    .rready (fifo_data_ready | hint_forward),
+    .rdata  ({alloc_q2, hint_descram,hint_forward,fifo_data})
+  );
+
+  // storage for mask calculations
+  prim_fifo_sync #(
+      .Width  (DataWidth),
+      .Pass   (0),
+      .Depth  (2)
+  ) u_mask_storage (
+    .clk_i,
+    .rst_ni,
+    .clr_i  (1'b0),
+    .wvalid (calc_req_o & calc_ack_i),
+    .wready (mask_fifo_rdy),
+    .wdata  (mask_i),
+    .depth  (),
+    .rvalid (mask_valid),
+    .rready (fifo_data_ready | hint_forward),
+    .rdata  (mask)
+  );
+
+  // generate the mask calculation request
+  always_ff @(posedge clk_i or negedge rst_ni) begin
+    if (!rst_ni) begin
+      calc_req_o <= '0;
+    end else if (req_o && descramble_i) begin
+      calc_req_o <= 1'b1;
+    end else if (calc_req_o && calc_ack_i) begin
+      calc_req_o <= 1'b0;
+    end
+  end
+
+  // operand to gf_mult
+  assign calc_addr_o = rd_attrs.addr;
+
+  // generate the descramble request whenever both stages are available
+  // and there is a need to descramble
+  assign descramble_req_o = fifo_data_valid & mask_valid & !hint_forward;
+
+  // scrambled data to de-scramble
+  assign scrambled_data_o = fifo_data ^ mask;
+
+  // muxed data
+  assign muxed_data = hint_descram ? descrambled_data_i ^ mask : data_i;
+
+  // muxed data valid
+  // if no de-scramble required, return data on read complete
+  // if data is all empty (erased), also return data on read complete
+  // if descramble is required, return data when descrambler finishes
+  assign data_valid = forward | fifo_data_ready;
 
 
   /////////////////////////////////
@@ -275,10 +406,15 @@
   logic [DataWidth-1:0] buf_rsp_data;
 
   // update buffers
-  assign update = rd_done ? alloc_q : '0;
+  // When forwarding, update entry stored in alloc_q
+  // When de-scrambling however, the contents of alloc_q may have already updated to the next read,
+  // so a different pointer is used.
+  // assign update = data_valid ? alloc_q : '0;
+  assign update = forward         ? alloc_q  :
+                  fifo_data_ready ? alloc_q2 : '0;
 
   // match in flash response when allocated buffer is the same as top of response fifo
-  assign flash_rsp_match = rsp_fifo_vld & rd_done & (rsp_fifo_rdata.buf_sel == alloc_q);
+  assign flash_rsp_match = rsp_fifo_vld & data_valid & (rsp_fifo_rdata.buf_sel == update);
 
   // match in buf response when there is a valid buffer that is the same as top of response fifo
   for (genvar i = 0; i < NumBuf; i++) begin: gen_buf_rsp_match
@@ -287,7 +423,7 @@
 
   // select among the buffers
   always_comb begin
-    buf_rsp_data = data_i;
+    buf_rsp_data = muxed_data;
     for (int i = 0; i < NumBuf; i++) begin
       if (buf_rsp_match[i]) begin
         buf_rsp_data = read_buf[i].data;
@@ -298,21 +434,20 @@
   if (WidthMultiple == 1) begin : gen_width_one_rd
     // When multiple is 1, just pass the read through directly
     logic unused_word_sel;
-    assign data_o = |buf_rsp_match ? buf_rsp_data : data_i;
+    assign data_o = |buf_rsp_match ? buf_rsp_data : muxed_data;
     assign unused_word_sel = rsp_fifo_rdata.word_sel;
 
   end else begin : gen_rd
     // Re-arrange data into packed array to pick the correct one
     logic [WidthMultiple-1:0][BusWidth-1:0] bus_words_packed;
-    assign bus_words_packed = |buf_rsp_match ? buf_rsp_data : data_i;
+    assign bus_words_packed = |buf_rsp_match ? buf_rsp_data : muxed_data;
     assign data_o = bus_words_packed[rsp_fifo_rdata.word_sel];
 
   end
 
   assign data_valid_o = flash_rsp_match | |buf_rsp_match;
 
-
-  // the entire read pipeline is idle when there are no responses to return
+  // the entire read pipeline is idle when there are no responses to return
   assign idle_o = ~rsp_fifo_vld;
 
   /////////////////////////////////
diff --git a/hw/ip/flash_ctrl/rtl/flash_phy_scramble.sv b/hw/ip/flash_ctrl/rtl/flash_phy_scramble.sv
new file mode 100644
index 0000000..1796e01
--- /dev/null
+++ b/hw/ip/flash_ctrl/rtl/flash_phy_scramble.sv
@@ -0,0 +1,82 @@
+// Copyright lowRISC contributors.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Flash Phy Scramble Module
+//
+// This module implements the flash scramble / de-scramble operation
+// This operation is actually XEX.  However the components are broken
+// in two and separately manipulated by the program and read pipelines.
+//
+
+module flash_phy_scramble import flash_phy_pkg::*; (
+  input clk_i,
+  input rst_ni,
+  input calc_req_i, // calculate galois multiplier mask
+  input op_req_i,   // request primitive operation
+  input cipher_ops_e op_type_i,  // scramble or de-scramble
+  input [BankAddrW-1:0] addr_i,  // address operand for the mask calculation
+  input [DataWidth-1:0] plain_data_i,  // cipher input when op_type_i is ScrambleOp
+  input [DataWidth-1:0] scrambled_data_i,  // cipher input when op_type_i is DeScrambleOp
+  input [KeySize-1:0] addr_key_i,  // key material for the galois multiplier
+  input [KeySize-1:0] data_key_i,  // key for the cipher
+  output logic calc_ack_o,  // ack from the galois multiplier
+  output logic op_ack_o,  // valid from the cipher
+  output logic [DataWidth-1:0] mask_o,  // galois multiplier product
+  output logic [DataWidth-1:0] plain_data_o,
+  output logic [DataWidth-1:0] scrambled_data_o
+);
+
+  localparam int AddrPadWidth = DataWidth - BankAddrW;  // key bits padding addr_i to DataWidth
+  localparam int UnusedWidth = KeySize - AddrPadWidth;
+
+  // unused portion of addr_key; NOTE(review): overlaps key bits used by u_mult - confirm
+  logic [UnusedWidth-1:0] unused_key;
+  assign unused_key = addr_key_i[KeySize-1 -: UnusedWidth];
+
+  // Galois Multiply portion: derives mask_o from addr_i and addr_key_i
+  prim_gf_mult # (
+    .Width(DataWidth),
+    .StagesPerCycle(DataWidth / GfMultCycles)
+  ) u_mult (
+    .clk_i,
+    .rst_ni,
+    .req_i(calc_req_i),
+    .operand_a_i({addr_key_i[DataWidth +: AddrPadWidth], addr_i}),
+    .operand_b_i(addr_key_i[DataWidth-1:0]),
+    .ack_o(calc_ack_o),
+    .prod_o(mask_o)
+  );
+
+  // Cipher portion: one shared instance serves both scramble and de-scramble
+  logic dec;
+  logic [DataWidth-1:0] data;
+
+  assign dec = op_type_i == DeScrambleOp;
+
+  // Previous discussion settled on PRESENT, using PRINCE here for now
+  // just to get some area idea
+  prim_prince # (
+    .DataWidth(DataWidth),
+    .KeyWidth(KeySize),
+    .UseOldKeySched(1'b1),
+    .HalfwayDataReg(1'b1)
+  ) u_cipher (
+    .clk_i,
+    .rst_ni,
+    .valid_i(op_req_i),
+    .data_i(dec ? scrambled_data_i : plain_data_i),
+    .key_i(data_key_i),
+    .dec_i(dec),
+    .data_o(data),
+    .valid_o(op_ack_o)
+  );
+
+  // if decrypt, output the unscrambled data, otherwise pass scrambled_data_i through
+  assign plain_data_o = dec ? data : scrambled_data_i;
+
+  // if encrypt, output the scrambled data, otherwise pass plain_data_i through
+  assign scrambled_data_o = dec ? plain_data_i : data;
+
+
+endmodule // flash_phy_scramble
diff --git a/sw/device/tests/flash_ctrl_test.c b/sw/device/tests/flash_ctrl_test.c
index a09d9d0..9b003d2 100644
--- a/sw/device/tests/flash_ctrl_test.c
+++ b/sw/device/tests/flash_ctrl_test.c
@@ -76,6 +76,12 @@
                        output_page));
   CHECK_ARRAYS_EQ(output_page, input_page, FLASH_WORDS_PER_PAGE);
 
+  // Check from host side also
+  for (int i = 0; i < FLASH_WORDS_PER_PAGE; i++) {
+    output_page[i] = mmio_region_read32(flash_bank_1, i * sizeof(uint32_t));
+  }
+  CHECK_ARRAYS_EQ(output_page, input_page, FLASH_WORDS_PER_PAGE);
+
   // Similar check for info page
   CHECK_EQZ(flash_page_erase(flash_bank_1_addr, kInfoPartition));
   CHECK_EQZ(flash_write(flash_bank_1_addr, kInfoPartition, input_page,