[dv/common] Add non_blocking access in tl_access

Rename "outstanding_csr_accesses" to "outstanding_accesses" so that it covers
general outstanding accesses (such as those from tl_access).
Implement functions and a task to increment and decrement
outstanding_accesses and to wait until it drops to 0.
Add non-blocking cases for tl_access.
diff --git a/hw/dv/sv/cip_lib/cip_base_vseq.sv b/hw/dv/sv/cip_lib/cip_base_vseq.sv
index ccacd9a..c876892 100644
--- a/hw/dv/sv/cip_lib/cip_base_vseq.sv
+++ b/hw/dv/sv/cip_lib/cip_base_vseq.sv
@@ -25,7 +25,7 @@
 
   `uvm_object_new
   // knobs to disable post_start clear interrupt routine
-  bit do_clear_all_interrupts = 1'b1;
+  bit  do_clear_all_interrupts = 1'b1;
   // csr queue for intr test/enable/state
   dv_base_reg intr_test_csrs[$];
   dv_base_reg intr_state_csrs[$];
@@ -51,7 +51,7 @@
 
   // tl_access task: does a single TL_W-bit write or read transaction to the specified address
   // note that this task does not update ral model; optionally also checks for error response
-  // TODO: add additional args? non-blocking? respose data check? timeout? spinwait?
+  // TODO: add additional args? response data check? spinwait?
   // TODO: randomize size, addr here based on given addr range, data, and mask, eventually can be
   // reused for mem_read, partial read, and hmac msg fifo write
   virtual task tl_access(input bit [TL_AW-1:0]  addr,
@@ -60,31 +60,54 @@
                          input bit [TL_DBW-1:0] mask = '1,
                          input bit [TL_SZW-1:0] size = 2,
                          input bit              check_rsp = 1'b1,
-                         input bit              exp_err_rsp = 1'b0);
-    tl_host_single_seq  tl_seq;
-    `uvm_create_on(tl_seq, p_sequencer.tl_sequencer_h)
-    outstanding_csr_accesses++;
-    if (cfg.zero_delays) begin
-      tl_seq.min_req_delay = 0;
-      tl_seq.max_req_delay = 0;
+                         input bit              exp_err_rsp = 1'b0,
+                         input bit              blocking = csr_utils_pkg::default_csr_blocking);
+    if (blocking) begin
+      tl_access_sub(addr, write, data, mask, size, check_rsp, exp_err_rsp);
+    end else begin
+      fork
+        tl_access_sub(addr, write, data, mask, size, check_rsp, exp_err_rsp);
+      join_none
+      // Add #0 to ensure that this thread starts executing before any subsequent call
+      #0;
     end
-    `DV_CHECK_RANDOMIZE_WITH_FATAL(tl_seq,
-        addr      == local::addr;
-        size      == local::size;
-        mask      == local::mask;
-        if (write) {
-          if ($countones(mask) < (1 << size)) opcode == tlul_pkg::PutPartialData;
-          else opcode inside {tlul_pkg::PutPartialData, tlul_pkg::PutFullData};
-          data    == local::data;
-        } else {
-          opcode  == tlul_pkg::Get;
-        })
-    `uvm_send(tl_seq)
-    if (!write) data = tl_seq.rsp.d_data;
-    if (check_rsp) begin
-      `DV_CHECK_EQ(tl_seq.rsp.d_error, exp_err_rsp, "unexpected error response")
-    end
-    outstanding_csr_accesses--;
+  endtask
+
+  // Runs one TL transaction inside DV_SPINWAIT and tracks it as an outstanding access;
+  // called by tl_access either inline (blocking) or from a forked thread (non-blocking).
+  virtual task tl_access_sub(input bit [TL_AW-1:0]  addr,
+                             input bit              write,
+                             inout bit [TL_DW-1:0]  data,
+                             input bit [TL_DBW-1:0] mask = '1,
+                             input bit [TL_SZW-1:0] size = 2,
+                             input bit              check_rsp = 1'b1,
+                             input bit              exp_err_rsp = 1'b0);
+    `DV_SPINWAIT(
+        tl_host_single_seq  tl_seq;
+        `uvm_create_on(tl_seq, p_sequencer.tl_sequencer_h)
+        if (cfg.zero_delays) begin
+          tl_seq.min_req_delay = 0;
+          tl_seq.max_req_delay = 0;
+        end
+        csr_utils_pkg::increment_outstanding_access();
+        `DV_CHECK_RANDOMIZE_WITH_FATAL(tl_seq,
+            addr == local::addr;
+            size == local::size;
+            mask == local::mask;
+            if (write) {
+              if ($countones(mask) < (1 << size)) opcode == tlul_pkg::PutPartialData;
+              else opcode inside {tlul_pkg::PutPartialData, tlul_pkg::PutFullData};
+              data == local::data;
+            } else {
+              opcode == tlul_pkg::Get;
+            })
+        `uvm_send(tl_seq)
+        if (!write) data = tl_seq.rsp.d_data;
+        if (check_rsp) begin
+          `DV_CHECK_EQ(tl_seq.rsp.d_error, exp_err_rsp, "unexpected error response")
+        end
+        csr_utils_pkg::decrement_outstanding_access();,
+        $sformatf("Timeout waiting tl_access : addr=0x%0h", addr))
+  endtask
 
   virtual task tl_access_unmapped_addr();
diff --git a/hw/dv/sv/csr_utils/csr_seq_lib.sv b/hw/dv/sv/csr_utils/csr_seq_lib.sv
index 92764f1..02cf841 100644
--- a/hw/dv/sv/csr_utils/csr_seq_lib.sv
+++ b/hw/dv/sv/csr_utils/csr_seq_lib.sv
@@ -59,7 +59,7 @@
   // post_start
   virtual task post_start();
     super.post_start();
-    wait(outstanding_csr_accesses == 0);
+    wait_no_outstanding_access();
     test_csrs.delete();
   endtask
 
@@ -451,7 +451,7 @@
                      .compare_vs_ral(1'b1),
                      .compare_mask  (compare_mask));
       end
-      wait(outstanding_csr_accesses == 0);
+      wait_no_outstanding_access();
     end
   endtask
 
diff --git a/hw/dv/sv/csr_utils/csr_utils_pkg.sv b/hw/dv/sv/csr_utils/csr_utils_pkg.sv
index 03fda92..6781e9e 100644
--- a/hw/dv/sv/csr_utils/csr_utils_pkg.sv
+++ b/hw/dv/sv/csr_utils/csr_utils_pkg.sv
@@ -12,7 +12,7 @@
   `include "dv_macros.svh"
 
   // local types and variables
-  uint       outstanding_csr_accesses    = 0;
+  uint       outstanding_accesses        = 0;
   uint       default_timeout_ns          = 1_000_000; // 1ms
   uint       default_spinwait_timeout_ns = 10_000_000; // 10ms
   string     msg_id                      = "csr_utils";
@@ -36,6 +36,18 @@
     CsrExclAll        = 3'b111  // exclude csr from init or write or writ-read check
   } csr_excl_type_e;
 
+  function automatic void increment_outstanding_access();
+    outstanding_accesses += 1;  // one more access has been issued and is not yet complete
+  endfunction
+
+  function automatic void decrement_outstanding_access();
+    outstanding_accesses -= 1;  // must pair with an earlier increment (no underflow guard)
+  endfunction
+
+  task automatic wait_no_outstanding_access();
+    wait(outstanding_accesses == 0);  // blocks until the outstanding access count is back to 0
+  endtask
+
   // Get all valid csr addrs - useful to check if incoming addr falls in the csr range.
   function automatic void get_csr_addrs(input uvm_reg_block ral, ref uvm_reg_addr_t csr_addrs[$]);
     uvm_reg csrs[$];
@@ -167,12 +179,12 @@
 
         fork
           begin
-            outstanding_csr_accesses++;
+            increment_outstanding_access();
             csr.update(.status(status), .path(path), .map(map));
             if (check == UVM_CHECK) begin
               `DV_CHECK_EQ(status, UVM_IS_OK, "", error, msg_id)
             end
-            outstanding_csr_accesses--;
+            decrement_outstanding_access();
           end
           begin
             wait_timeout(timeout_ns, msg_id,
@@ -217,12 +229,12 @@
 
         fork
           begin
-            outstanding_csr_accesses++;
+            increment_outstanding_access();
             csr.write(.status(status), .value(value), .path(path), .map(map));
             if (check == UVM_CHECK) begin
               `DV_CHECK_EQ(status, UVM_IS_OK, "", error, msg_id)
             end
-            outstanding_csr_accesses--;
+            decrement_outstanding_access();
           end
           begin
             wait_timeout(timeout_ns, msg_id,
@@ -268,7 +280,7 @@
 
         fork
           begin
-            outstanding_csr_accesses++;
+            increment_outstanding_access();
             csr_or_fld = decode_csr_or_field(ptr);
             if (csr_or_fld.field != null) begin
               csr_or_fld.field.read(.status(status), .value(value), .path(path), .map(map));
@@ -278,7 +290,7 @@
             if (check == UVM_CHECK) begin
               `DV_CHECK_EQ(status, UVM_IS_OK, "", error, msg_id)
             end
-            outstanding_csr_accesses--;
+            decrement_outstanding_access();
           end
           begin
             wait_timeout(timeout_ns, msg_id,
@@ -312,7 +324,7 @@
             uvm_reg_data_t  exp;
             string          msg_id = {csr_utils_pkg::msg_id, "::csr_rd_check"};
 
-            outstanding_csr_accesses++;
+            increment_outstanding_access();
             csr_or_fld = decode_csr_or_field(ptr);
             // get mirrored value before the read
             if (csr_or_fld.field != null) begin
@@ -327,7 +339,7 @@
               exp = (compare_vs_ral ? exp : compare_value) & compare_mask;
               `DV_CHECK_EQ(obs, exp, err_msg, error, msg_id)
             end
-            outstanding_csr_accesses--;
+            decrement_outstanding_access();
           end
         join_none
         if (blocking) wait fork;
@@ -418,12 +430,12 @@
 
         fork
           begin
-            outstanding_csr_accesses++;
+            increment_outstanding_access();
             ptr.read(.status(status), .offset(offset), .value(data), .map(map));
             if (check == UVM_CHECK) begin
               `DV_CHECK_EQ(status, UVM_IS_OK, "", error, msg_id)
             end
-            outstanding_csr_accesses--;
+            decrement_outstanding_access();
           end
           begin : mem_rd_timeout
             wait_timeout(timeout_ns, msg_id,
@@ -467,12 +479,12 @@
 
         fork
           begin
-            outstanding_csr_accesses++;
+            increment_outstanding_access();
             ptr.write(.status(status), .offset(offset), .value(data), .map(map));
             if (check == UVM_CHECK) begin
               `DV_CHECK_EQ(status, UVM_IS_OK, "", error, msg_id)
             end
-            outstanding_csr_accesses--;
+            decrement_outstanding_access();
           end
           begin
             wait_timeout(timeout_ns, msg_id,
diff --git a/hw/dv/sv/dv_lib/dv_base_vseq.sv b/hw/dv/sv/dv_lib/dv_base_vseq.sv
index bafdffa..86ef069 100644
--- a/hw/dv/sv/dv_lib/dv_base_vseq.sv
+++ b/hw/dv/sv/dv_lib/dv_base_vseq.sv
@@ -72,7 +72,7 @@
 
   // dut shutdown - this is called in post_start if do_dut_shutdown bit is set
   virtual task dut_shutdown();
-    // TODO(sriyerg): wait for pending items in tl agent to clear up
+    csr_utils_pkg::wait_no_outstanding_access();
   endtask
 
   // function to add csr exclusions of the given type using the csr_excl_item item
diff --git a/hw/ip/hmac/dv/env/hmac_scoreboard.sv b/hw/ip/hmac/dv/env/hmac_scoreboard.sv
index e1f48b9..5e36410 100644
--- a/hw/ip/hmac/dv/env/hmac_scoreboard.sv
+++ b/hw/ip/hmac/dv/env/hmac_scoreboard.sv
@@ -57,7 +57,10 @@
           end
           foreach(msg[i])  begin
             msg_q.push_back(msg[i]);
-            if (msg_q.size() % 4 == 0) incr_wr_and_check_fifo_full();
+            if (msg_q.size() % 4 == 0) begin
+              wait(cfg.tlul_assert_vif.d2h.a_ready == 1); // wait for outstanding transaction
+              incr_wr_and_check_fifo_full();
+            end
           end
         end
       end else begin
@@ -149,12 +152,18 @@
   endfunction
 
   virtual task incr_wr_and_check_fifo_full();
+    @(negedge cfg.clk_rst_vif.clk);
     hmac_wr_cnt ++;
     if ((hmac_wr_cnt - hmac_rd_cnt) == HMAC_MSG_FIFO_DEPTH) begin
       ral.intr_state.fifo_full.predict(1);
     end
   endtask
 
+  // Internal msg_fifo model used to check the fifo status and interrupt.
+  // Monitor rd_cnt and wr_cnt on the negedge of the clk.
+  // rd_cnt is followed by wr_cnt with a clk cycle delay, except when:
+  // 1) hmac processes the key: the DUT will process the key first
+  // 2) read cnt reaches FIFO_MAX: the DUT will process the msg in the FIFO
   virtual task hmac_process_fifo_rd();
     bit key_processed;
     fork
@@ -167,6 +176,7 @@
               if (ral.cfg.hmac_en.get_mirrored_value() && hmac_rd_cnt == 0) begin
                 // 80 cycles for hmac key padding, 1 cycle for hash_start reg to reset
                 cfg.clk_rst_vif.wait_clks(HMAC_KEY_PROCESS_CYCLES + 1);
+                @(negedge cfg.clk_rst_vif.clk);
                 key_processed = 1;
               end
               while (1) begin
@@ -192,10 +202,10 @@
               if (ral.cfg.hmac_en.get_mirrored_value() && hmac_rd_cnt == 0) begin
                 wait(key_processed);
               end
-              cfg.clk_rst_vif.wait_clks(1);
-              @(negedge cfg.clk_rst_vif.clk);
+              #1; // delay 1 ns to make sure did not sample right at negedge clk
+              cfg.clk_rst_vif.wait_n_clks(1);
               hmac_rd_cnt ++;
-              if (hmac_rd_cnt % 16 == 0) cfg.clk_rst_vif.wait_clks(HMAC_MSG_PROCESS_CYCLES);
+              if (hmac_rd_cnt % 16 == 0) cfg.clk_rst_vif.wait_n_clks(HMAC_MSG_PROCESS_CYCLES);
             end
             begin : reset_hmac_fifo_rd
               wait(!cfg.clk_rst_vif.rst_n);
diff --git a/hw/ip/hmac/dv/env/seq_lib/hmac_base_vseq.sv b/hw/ip/hmac/dv/env/seq_lib/hmac_base_vseq.sv
index 1b7d81f..9a32a7b 100644
--- a/hw/ip/hmac/dv/env/seq_lib/hmac_base_vseq.sv
+++ b/hw/ip/hmac/dv/env/seq_lib/hmac_base_vseq.sv
@@ -161,11 +161,14 @@
       word = {>>byte{word_unpack}};
       `uvm_info(`gfn, $sformatf("wr_size = %0h, wr_addr = %0h, wr_mask = %0h, words = 0x%0h",
                                 wr_size, wr_addr, wr_mask, word), UVM_HIGH)
-      tl_access(.addr(wr_addr), .write(1'b1), .data(word), .mask(wr_mask), .size(wr_size));
+      tl_access(.addr(wr_addr), .write(1'b1), .data(word), .mask(wr_mask), .size(wr_size),
+                .blocking($urandom_range(0, 1)));
 
       if (!do_back_pressure) check_status_intr_fifo_full();
       else clear_intr_fifo_full();
     end
+    // ensure all msg fifo writes are done before triggering hmac_process
+    csr_utils_pkg::wait_no_outstanding_access();
   endtask
 
   // read fifo_depth reg and burst write a chunk of words
@@ -185,13 +188,15 @@
           `uvm_info(`gfn, $sformatf("wr_size = %0h, wr_addr = %0h, wr_mask = %0h, words = 0x%0h",
                                     wr_size, wr_addr, wr_mask, word), UVM_HIGH)
           `DV_CHECK_FATAL(randomize(wr_size, wr_addr, wr_mask) with {wr_size == 2;})
-          tl_access(.addr(wr_addr), .write(1'b1), .data(word), .mask(wr_mask), .size(wr_size));
+          tl_access(.addr(wr_addr), .write(1'b1), .data(word), .mask(wr_mask), .size(wr_size),
+                    .blocking($urandom_range(0, 1)));
         end
         clear_intr_fifo_full();
-     end else begin // remaining msg is smaller than the burst_wr_length
-       wr_msg(msg_q);
-       break;
-     end
+      end else begin // remaining msg is smaller than the burst_wr_length
+        wr_msg(msg_q);
+        break;
+      end
+    csr_utils_pkg::wait_no_outstanding_access();
     end
   endtask
 
diff --git a/hw/ip/hmac/dv/env/seq_lib/hmac_reset_vseq.sv b/hw/ip/hmac/dv/env/seq_lib/hmac_reset_vseq.sv
index 2257efa..d599075 100644
--- a/hw/ip/hmac/dv/env/seq_lib/hmac_reset_vseq.sv
+++ b/hw/ip/hmac/dv/env/seq_lib/hmac_reset_vseq.sv
@@ -40,7 +40,7 @@
         begin : reset
           `DV_CHECK_MEMBER_RANDOMIZE_FATAL(delay)
           cfg.clk_rst_vif.wait_clks(delay);
-          wait(outstanding_csr_accesses == 0); // TODO : temp wait, need support
+          csr_utils_pkg::wait_no_outstanding_access(); // TODO : temp wait, need support
           reset_flag = 1'b1;
         end
       join_any
diff --git a/hw/ip/rv_timer/dv/env/seq_lib/rv_timer_sanity_vseq.sv b/hw/ip/rv_timer/dv/env/seq_lib/rv_timer_sanity_vseq.sv
index fc58f8f..6c2b8d7 100644
--- a/hw/ip/rv_timer/dv/env/seq_lib/rv_timer_sanity_vseq.sv
+++ b/hw/ip/rv_timer/dv/env/seq_lib/rv_timer_sanity_vseq.sv
@@ -114,7 +114,7 @@
             if (assert_reset) begin
               `DV_CHECK_MEMBER_RANDOMIZE_FATAL(delay)
               cfg.clk_rst_vif.wait_clks(delay);
-              wait(outstanding_csr_accesses == 0);
+              csr_utils_pkg::wait_no_outstanding_access();
               apply_reset("HARD");
             end
           join_none