Revert "Revert "[usbdev] Fixes for I/O modes and expand their tests""

This reverts commit d2e1184308b9582551252d1135a126ea558a6ab8 which
itself reverts commit 66c509296798cdd9dbeba4deeb381d0cecf3b429.

The original commit was suspected to increase FPGA synthesis times by
2x and to cause many CI failures. With lowRISC/OpenTitan#3647 those
issues have been resolved, which is why the original commit can be
re-applied.

Signed-off-by: Pirmin Vogel <vogelpi@lowrisc.org>
diff --git a/hw/dv/dpi/usbdpi/monitor_usb.c b/hw/dv/dpi/usbdpi/monitor_usb.c
index 778936c..a573cde 100644
--- a/hw/dv/dpi/usbdpi/monitor_usb.c
+++ b/hw/dv/dpi/usbdpi/monitor_usb.c
@@ -97,22 +97,43 @@
   log = (loglevel & 0x2);
   compact = (loglevel & 0x1);
 
-  if ((d2p & D2P_DP_EN) || (d2p & D2P_DN_EN)) {
+  if ((d2p & D2P_DP_EN) || (d2p & D2P_DN_EN) || (d2p & D2P_D_EN)) {
     if (hdrive) {
       fprintf(mon_file, "mon: %8d: Bus clash\n", tick);
     }
-    dp = ((d2p & D2P_DP_EN) && (d2p & D2P_DP)) ? 1 : 0;
-    dn = ((d2p & D2P_DN_EN) && (d2p & D2P_DN)) ? 1 : 0;
+    if (d2p & D2P_TXMODE_SE) {
+      dp = ((d2p & D2P_DP_EN) && (d2p & D2P_DP)) ? 1 : 0;
+      dn = ((d2p & D2P_DN_EN) && (d2p & D2P_DN)) ? 1 : 0;
+    } else {
+      if ((d2p & D2P_SE0) || !(d2p & D2P_D_EN)) {
+        dp = 0;
+        dn = 0;
+      } else {
+        dp = (d2p & D2P_D) ? 1 : 0;
+        dn = (d2p & D2P_D) ? 0 : 1;
+      }
+    }
     mon->driver = M_DEVICE;
   } else if (hdrive) {
-    dp = (p2d & P2D_DP) ? 1 : 0;
-    dn = (p2d & P2D_DN) ? 1 : 0;
+    if (d2p & D2P_TXMODE_SE) {
+      dp = (p2d & P2D_DP) ? 1 : 0;
+      dn = (p2d & P2D_DN) ? 1 : 0;
+    } else {
+      if (p2d & (P2D_DP | P2D_DN)) {
+        dp = (p2d & P2D_D) ? 1 : 0;
+        dn = (p2d & P2D_D) ? 0 : 1;
+      } else {
+        dp = 0;
+        dn = 0;
+      }
+    }
     mon->driver = M_HOST;
   } else {
     if ((mon->driver != M_NONE) || (mon->pu != (d2p & D2P_PU))) {
       if (log) {
         if (d2p & D2P_PU) {
-          fprintf(mon_file, "mon: %8d: Idle, FS resistor\n", tick);
+          fprintf(mon_file, "mon: %8d: Idle, FS resistor (d2p 0x%x)\n", tick,
+                  d2p);
         } else {
           fprintf(mon_file, "mon: %8d: Idle, SE0\n", tick);
         }
@@ -123,6 +144,12 @@
     mon->line = 0;
     return;
   }
+  // If the DN pullup is there then swap
+  if (d2p & D2P_DNPU) {
+    int tmp = dp;
+    dp = dn;
+    dn = tmp;
+  }
   mon->line = (mon->line << 2) | dp << 1 | dn;
 
   if (mon->state == MS_IDLE) {
diff --git a/hw/dv/dpi/usbdpi/usbdpi.c b/hw/dv/dpi/usbdpi/usbdpi.c
index 5348dd7..d9a4664 100644
--- a/hw/dv/dpi/usbdpi/usbdpi.c
+++ b/hw/dv/dpi/usbdpi/usbdpi.c
@@ -21,6 +21,14 @@
 #include <sys/types.h>
 #include <unistd.h>
 
+// Historically the simulation started too fast to connect to all
+// the fifos and terminals without loss of output. So a delay was added.
+// Today the startup is slow enough this does not seem to be needed.
+// In case things change again, I'm going to leave this behind a define
+// for now, but if this continues not to be needed the code can be deleted.
+// Uncomment next line if you need the delay
+// #define NEED_SLEEP
+
 static const char *st_states[] = {"ST_IDLE 0", "ST_SEND 1", "ST_GET 2",
                                   "ST_SYNC 3", "ST_EOP 4",  "ST_EOP0 5"};
 
@@ -39,6 +47,7 @@
   ctx->frame = 0;
   ctx->framepend = 0;
   ctx->lastframe = 0;
+  ctx->last_pu = 0;
   ctx->inframe = 4;
   ctx->state = ST_IDLE;
   ctx->driving = 0;
@@ -86,13 +95,13 @@
   char raw_str[D2P_BITS + 1];
   {
     int i;
-    for (i = 0; i < 5; i++) {
-      raw_str[5 - i - 1] = !!(d2p & (1 << i)) + '0';
+    for (i = 0; i < D2P_BITS; i++) {
+      raw_str[D2P_BITS - i - 1] = d2p & (1 << i) ? '1' : '0';
     }
   }
   raw_str[D2P_BITS] = 0;
 
-  if (d2p & (D2P_DP_EN | D2P_DN_EN)) {
+  if (d2p & (D2P_DP_EN | D2P_DN_EN | D2P_D_EN)) {
     if (ctx->state == ST_SEND) {
       printf("USB: %4x %8d error state %s hs %s and device drives\n",
              ctx->frame, ctx->tick, st_states[ctx->state],
@@ -105,17 +114,56 @@
     }
   }
 
-  dp = (((d2p & D2P_DP_EN) && (d2p & D2P_DP)) ||
-        (!(d2p & D2P_DP_EN) && (d2p & D2P_PU)))
-           ? 1
-           : 0;
-
-  dn = ((d2p & D2P_DN_EN) && (d2p & D2P_DN)) ? 1 : 0;
+  if ((d2p & D2P_DNPU) && (d2p & D2P_DPPU)) {
+    printf("USB: %4x %8d error both pullups are driven\n", ctx->frame,
+           ctx->tick);
+  }
+  if ((d2p & D2P_PU) != ctx->last_pu) {
+    n = snprintf(obuf, MAX_OBUF, "%4x %8d Pullup change to %s%s%s\n",
+                 ctx->frame, ctx->tick, (d2p & D2P_DPPU) ? "DP Pulled up " : "",
+                 (d2p & D2P_DNPU) ? "DN Pulled up " : "",
+                 (d2p & D2P_TXMODE_SE) ? "SingleEnded" : "Differential");
+    ssize_t written = fwrite(obuf, sizeof(char), (size_t)n, ctx->mon_file);
+    assert(written == n);
+    ctx->last_pu = d2p & D2P_PU;
+  }
+  if (d2p & D2P_TXMODE_SE) {
+    // Normal D+/D- mode
+    if (d2p & D2P_DNPU) {
+      // DN pullup would say DP and DN are swapped
+      dp = ((d2p & D2P_DN_EN) && (d2p & D2P_DN)) ||
+           (!(d2p & D2P_DN_EN) && (d2p & D2P_DNPU));
+      dn = (d2p & D2P_DP_EN) && (d2p & D2P_DP);
+    } else {
+      // No DN pullup so normal orientation
+      dp = ((d2p & D2P_DP_EN) && (d2p & D2P_DP)) ||
+           (!(d2p & D2P_DP_EN) && (d2p & D2P_DPPU));
+      dn = (d2p & D2P_DN_EN) && (d2p & D2P_DN);
+    }
+  } else {
+    // "differential" mode uses D and SE0
+    if (d2p & D2P_D_EN) {
+      if (d2p & D2P_DNPU) {
+        // Pullup says swap i.e. D is inverted
+        dp = (d2p & D2P_SE0) ? 0 : ((d2p & D2P_D) ? 0 : 1);
+        dn = (d2p & D2P_SE0) ? 0 : ((d2p & D2P_D) ? 1 : 0);
+      } else {
+        dp = (d2p & D2P_SE0) ? 0 : ((d2p & D2P_D) ? 1 : 0);
+        dn = (d2p & D2P_SE0) ? 0 : ((d2p & D2P_D) ? 0 : 1);
+      }
+    } else {
+      dp = (d2p & D2P_PU) ? 1 : 0;
+      dn = 0;
+    }
+  }
 
   if (ctx->loglevel & LOG_BIT) {
-    n = snprintf(obuf, MAX_OBUF, "%4x %8d %s %s %s\n", ctx->frame, ctx->tick,
-                 raw_str, (d2p & D2P_PU) ? "PU" : "  ",
-                 (ctx->state == ST_GET) ? decode_usb[dp << 1 | dn] : "ZZ ");
+    const char *pullup = (d2p & D2P_PU) ? "PU" : "  ";
+    const char *state =
+        (ctx->state == ST_GET) ? decode_usb[dp << 1 | dn] : "ZZ ";
+
+    n = snprintf(obuf, MAX_OBUF, "%4x %8d %s %s %s %x\n", ctx->frame, ctx->tick,
+                 raw_str, pullup, state, d2p);
     ssize_t written = fwrite(obuf, sizeof(char), (size_t)n, ctx->mon_file);
     assert(written == n);
   }
@@ -605,6 +653,37 @@
   }
 }
 
+int set_driving(struct usbdpi_ctx *ctx, int d2p, int newval) {
+  if (d2p & D2P_DNPU) {
+    if (d2p & D2P_TXMODE_SE) {
+      return (ctx->driving & P2D_SENSE) | ((newval & P2D_DP) ? P2D_DN : 0) |
+             ((newval & P2D_DN) ? P2D_DP : 0);
+    }
+    if (newval & (P2D_DP | P2D_DN)) {
+      // Differential, swapped pins: drive DP; D follows the DN bit of newval
+      return (ctx->driving & P2D_SENSE) | P2D_DP |
+             ((newval & P2D_DN) ? P2D_D : 0);
+    }
+    // SE0: D is a don't-care, keep only SENSE (D clear is 1 after swapping)
+    return ctx->driving & P2D_SENSE;
+  }
+  if (d2p & D2P_TXMODE_SE) {
+    return (ctx->driving & P2D_SENSE) | newval;
+  }
+  if (newval & (P2D_DP | P2D_DN)) {
+    // Differential, normal pins: drive DN; D follows the DP bit of newval
+    return (ctx->driving & P2D_SENSE) | P2D_DN |
+           ((newval & P2D_DP) ? P2D_D : 0);
+  }
+  // SE0: D is a don't-care, keep SENSE and set D (DP and DN stay clear)
+  return (ctx->driving & P2D_SENSE) | P2D_D;
+}
+
+int inv_driving(struct usbdpi_ctx *ctx, int d2p) {
+  // Flip DP and DN (single-ended) or D (differential); either orientation
+  return ctx->driving ^ ((d2p & D2P_TXMODE_SE) ? (P2D_DP | P2D_DN) : P2D_D);
+}
+
 char usbdpi_host_to_device(void *ctx_void, const svBitVecVal *usb_d2p) {
   struct usbdpi_ctx *ctx = (struct usbdpi_ctx *)ctx_void;
   assert(ctx);
@@ -615,10 +694,12 @@
 
   if (ctx->tick == 0) {
     int i;
+#ifdef NEED_SLEEP
     for (i = 7; i > 0; i--) {
       printf("Sleep %d...\n", i);
       sleep(1);
     }
+#endif
   }
   ctx->tick++;
   ctx->tick_bits = ctx->tick >> 2;
@@ -727,7 +808,7 @@
 
     case ST_SYNC:
       dat = ((USB_SYNC & ctx->bit)) ? P2D_DP : P2D_DN;
-      ctx->driving = (ctx->driving & P2D_SENSE) | dat;
+      ctx->driving = set_driving(ctx, d2p, dat);
       force_stat = 1;
       ctx->bit <<= 1;
       if (ctx->bit == 0x100) {
@@ -741,19 +822,19 @@
       if ((ctx->linebits & 0x3f) == 0x3f &&
           !INSERT_ERR_BITSTUFF) {  // sent 6 ones
         // bit stuff and force a transition
-        ctx->driving ^= (P2D_DP | P2D_DN);
+        ctx->driving = inv_driving(ctx, d2p);
         force_stat = 1;
         ctx->linebits = (ctx->linebits << 1);
       } else if (ctx->byte >= ctx->bytes) {
         ctx->state = ST_EOP;
-        ctx->driving = ctx->driving & P2D_SENSE;  // SE0
+        ctx->driving = set_driving(ctx, d2p, 0);  // SE0
         ctx->bit = 1;
         force_stat = 1;
       } else {
         int nextbit;
         nextbit = (ctx->data[ctx->byte] & ctx->bit) ? 1 : 0;
         if (nextbit == 0) {
-          ctx->driving ^= (P2D_DP | P2D_DN);
+          ctx->driving = inv_driving(ctx, d2p);
         }
         ctx->linebits = (ctx->linebits << 1) | nextbit;
         force_stat = 1;
@@ -769,18 +850,17 @@
       break;
 
     case ST_EOP0:
-      ctx->driving = ctx->driving & P2D_SENSE;  // SE0
+      ctx->driving = set_driving(ctx, d2p, 0);  // SE0
       ctx->state = ST_EOP;
       break;
 
     case ST_EOP:  // SE0 SE0 J
       if (ctx->bit == 4) {
-        ctx->driving = (ctx->driving & P2D_SENSE) | P2D_DP;  // J
+        ctx->driving = set_driving(ctx, d2p, P2D_DP);  // J
       }
       if (ctx->bit == 8) {
-        ctx->driving = (d2p & D2P_PU) ? (ctx->driving & P2D_SENSE) | P2D_DP
-                                      :               // Z + pullup
-                           ctx->driving & P2D_SENSE;  // z without pullup = SE0
+        // Stop driving: host pulldown to SE0 unless either pullup is present
+        ctx->driving = set_driving(ctx, d2p, (d2p & D2P_PU) ? P2D_DP : 0);
         if (ctx->byte == ctx->datastart) {
           ctx->bit = 1;
           ctx->state = ST_SYNC;
diff --git a/hw/dv/dpi/usbdpi/usbdpi.h b/hw/dv/dpi/usbdpi/usbdpi.h
index 0aaa587..da00193 100644
--- a/hw/dv/dpi/usbdpi/usbdpi.h
+++ b/hw/dv/dpi/usbdpi/usbdpi.h
@@ -33,16 +33,25 @@
 // Index of the unimplemented endpoint to test
 #define UNIMPL_EP_ID 15
 
-#define D2P_BITS 5
-#define D2P_DP 16
-#define D2P_DP_EN 8
-#define D2P_DN 4
-#define D2P_DN_EN 2
-#define D2P_PU 1
+#define D2P_BITS 11
+#define D2P_DP 1024
+#define D2P_DP_EN 512
+#define D2P_DN 256
+#define D2P_DN_EN 128
+#define D2P_D 64
+#define D2P_D_EN 32
+#define D2P_SE0 16
+#define D2P_SE0_EN 8
+#define D2P_DPPU 4
+#define D2P_DNPU 2
+#define D2P_TXMODE_SE 1
+// Either pullup (dp/dn swapped if the pullup is on DN)
+#define D2P_PU (D2P_DPPU | D2P_DNPU)
 
 #define P2D_SENSE 1
 #define P2D_DN 2
 #define P2D_DP 4
+#define P2D_D 8
 
 #define ST_IDLE 0
 #define ST_SEND 1
@@ -95,6 +104,7 @@
   FILE *mon_file;
   char mon_pathname[PATH_MAX];
   void *mon;
+  int last_pu;
   int lastrxpid;
   int tick;
   int tick_bits;
diff --git a/hw/dv/dpi/usbdpi/usbdpi.sv b/hw/dv/dpi/usbdpi/usbdpi.sv
index 8a353aa..0f3d88a 100644
--- a/hw/dv/dpi/usbdpi/usbdpi.sv
+++ b/hw/dv/dpi/usbdpi/usbdpi.sv
@@ -6,6 +6,7 @@
 
 // Bits in LOG_LEVEL sets what is output on socket
 // 0x01 -- monitor_usb (packet level)
+// 0x02 -- more verbose monitor
 // 0x08 -- bit level
 
 module usbdpi #(
@@ -21,21 +22,31 @@
   output logic dn_p2d,
   input  logic dn_d2p,
   input  logic dn_en_d2p,
+  output logic d_p2d,
+  input  logic d_d2p,
+  input  logic d_en_d2p,
+  input  logic se0_d2p,
+  input  logic se0_en_d2p,
+  input  logic txmode_d2p,
+  input  logic txmode_en_d2p,
+
   output logic sense_p2d,
-  input  logic pullup_d2p,
-  input  logic pullup_en_d2p
+  input  logic pullupdp_d2p,
+  input  logic pullupdp_en_d2p,
+  input  logic pullupdn_d2p,
+  input  logic pullupdn_en_d2p
 );
   import "DPI-C" function
     chandle usbdpi_create(input string name, input int loglevel);
 
   import "DPI-C" function
-    void usbdpi_device_to_host(input chandle ctx, input bit [4:0] d2p);
+    void usbdpi_device_to_host(input chandle ctx, input bit [10:0] d2p);
 
   import "DPI-C" function
     void usbdpi_close(input chandle ctx);
 
   import "DPI-C" function
-    byte usbdpi_host_to_device(input chandle ctx, input bit [4:0] d2p);
+    byte usbdpi_host_to_device(input chandle ctx, input bit [10:0] d2p);
 
   chandle ctx;
 
@@ -47,32 +58,39 @@
     usbdpi_close(ctx);
   end
 
-  logic [4:0] d2p;
-  logic [4:0] d2p_r;
+  logic [10:0] d2p;
+  logic [10:0] d2p_r;
   logic       unused_dummy;
   logic       unused_clk = clk_i;
   logic       unused_rst = rst_ni;
-  logic       dp_int, dn_int;
+  logic       dp_int, dn_int, d_int;
 
-  assign d2p = {dp_d2p, dp_en_d2p, dn_d2p, dn_en_d2p, pullup_d2p & pullup_en_d2p};
+  assign d2p = {dp_d2p, dp_en_d2p, dn_d2p, dn_en_d2p, d_d2p, d_en_d2p, se0_d2p, se0_en_d2p, pullupdp_d2p & pullupdp_en_d2p, pullupdn_d2p & pullupdn_en_d2p, txmode_d2p & txmode_en_d2p};
   always_ff @(posedge clk_48MHz_i) begin
-    if (pullup_d2p && pullup_en_d2p) begin
+    if ((pullupdp_d2p && pullupdp_en_d2p) || (pullupdn_d2p && pullupdn_en_d2p)) begin
       automatic byte p2d = usbdpi_host_to_device(ctx, d2p);
+      d_int <= p2d[3];
       dp_int <= p2d[2];
       dn_int <= p2d[1];
       sense_p2d <= p2d[0];
-      unused_dummy <= |p2d[7:3];
+      unused_dummy <= |p2d[7:4];
       d2p_r <= d2p;
       if (d2p_r != d2p) begin
         usbdpi_device_to_host(ctx, d2p);
       end
-    end else begin
+    end else begin // neither pullup is enabled
+      d_int <= 0;
       dp_int <= 0;
       dn_int <= 0;
     end
   end
 
   always_comb begin : proc_data
+    if (d_en_d2p) begin
+      d_p2d = d_d2p;
+    end else begin
+      d_p2d = d_int;
+    end
     if (dp_en_d2p) begin
       dp_p2d = dp_d2p;
     end else begin
diff --git a/hw/ip/usb_fs_nb_pe/rtl/usb_fs_nb_pe.sv b/hw/ip/usb_fs_nb_pe/rtl/usb_fs_nb_pe.sv
index 77b948b..afc5697 100644
--- a/hw/ip/usb_fs_nb_pe/rtl/usb_fs_nb_pe.sv
+++ b/hw/ip/usb_fs_nb_pe/rtl/usb_fs_nb_pe.sv
@@ -26,6 +26,7 @@
   input  logic [6:0]             dev_addr_i,
 
   input  logic                   cfg_eop_single_bit_i, // 1: detect a single SE0 bit as EOP
+  input  logic                   cfg_rx_differential_i, // 1: use differential rx data on usb_d_i
   input  logic                   tx_osc_test_mode_i, // Oscillator test mode (constantly output JK)
   input  logic [NumOutEps-1:0]   data_toggle_clear_i, // Clear the data toggles for an EP
 
@@ -66,17 +67,25 @@
   output logic                   sof_valid_o,
   output logic [10:0]            frame_index_o,
 
+  // RX line status
+  output logic                   rx_se0_det_o,
+  output logic                   rx_jjj_det_o,
+
   // RX errors
   output logic                   rx_crc_err_o,
   output logic                   rx_pid_err_o,
   output logic                   rx_bitstuff_err_o,
 
   ///////////////////////////////////////
-  // USB TX/RX Interface (synchronous) //
+  // USB RX Interface (synchronous)    //
   ///////////////////////////////////////
   input  logic                   usb_d_i,
-  input  logic                   usb_se0_i,
+  input  logic                   usb_dp_i,
+  input  logic                   usb_dn_i,
 
+  ///////////////////////////////////////
+  // USB TX Interface (synchronous)    //
+  ///////////////////////////////////////
   output logic                   usb_d_o,
   output logic                   usb_se0_o,
   output logic                   usb_oe_o
@@ -203,8 +212,10 @@
     .rst_ni                 (rst_ni),
     .link_reset_i           (link_reset_i),
     .cfg_eop_single_bit_i   (cfg_eop_single_bit_i),
+    .cfg_rx_differential_i  (cfg_rx_differential_i),
     .usb_d_i                (usb_d_i),
-    .usb_se0_i              (usb_se0_i),
+    .usb_dp_i               (usb_dp_i),
+    .usb_dn_i               (usb_dn_i),
     .tx_en_i                (usb_oe),
     .bit_strobe_o           (bit_strobe),
     .pkt_start_o            (rx_pkt_start),
@@ -216,6 +227,8 @@
     .rx_data_put_o          (rx_data_put),
     .rx_data_o              (rx_data),
     .valid_packet_o         (rx_pkt_valid),
+    .rx_se0_det_o           (rx_se0_det_o),
+    .rx_jjj_det_o           (rx_jjj_det_o),
     .crc_error_o            (rx_crc_err_o),
     .pid_error_o            (rx_pid_err_o),
     .bitstuff_error_o       (rx_bitstuff_err_o)
diff --git a/hw/ip/usb_fs_nb_pe/rtl/usb_fs_rx.sv b/hw/ip/usb_fs_nb_pe/rtl/usb_fs_rx.sv
index b550b80..6d61182 100644
--- a/hw/ip/usb_fs_nb_pe/rtl/usb_fs_rx.sv
+++ b/hw/ip/usb_fs_nb_pe/rtl/usb_fs_rx.sv
@@ -10,12 +10,14 @@
   input  logic rst_ni,
   input  logic link_reset_i,
 
-  // EOP configuration
+  // configuration
   input  logic cfg_eop_single_bit_i,
+  input  logic cfg_rx_differential_i,
 
   // USB data+ and data- lines (synchronous)
   input  logic usb_d_i,
-  input  logic usb_se0_i,
+  input  logic usb_dp_i,
+  input  logic usb_dn_i,
 
   // Transmit enable disables the receier
   input  logic tx_en_i,
@@ -42,6 +44,10 @@
   // Most recent packet passes PID and CRC checks
   output logic valid_packet_o,
 
+  // line status for the status detection (actual rx bits after clock recovery)
+  output logic rx_se0_det_o,
+  output logic rx_jjj_det_o,
+
   // Error detection
   output logic crc_error_o,
   output logic pid_error_o,
@@ -56,12 +62,13 @@
   // usb receive path //
   //////////////////////
 
+
   ///////////////////////////////////////
   // line state recovery state machine //
   ///////////////////////////////////////
 
-  // The receive path doesn't currently use a differential reciever.  because of
-  // this there is a chance that one of the differential pairs will appear to have
+  // If the receive path is set not to use a differential receiver:
+  // There is a chance that one of the differential pairs will appear to have
   // changed to the new state while the other is still in the old state.  the
   // following state machine detects transitions and waits an extra sampling clock
   // before decoding the state on the differential pair.  this transition period
@@ -69,31 +76,55 @@
   // if there is enough noise on the line then the data may be corrupted and the
   // packet will fail the data integrity checks.
 
-  logic [2:0] line_state_q, line_state_d;
+  // If the receive path uses a differential receiver:
+  // The single ended signals must still be recovered to detect SE0
+  // Note that the spec warns in section 7.1.4.1:
+  // Both D+ and D- may temporarily be less than VIH (min) during differential
+  // signal transitions. This period can be up to 14 ns (TFST) for full-speed
+  // transitions and up to 210 ns (TLST) for low-speed transitions. Logic in the
+  // receiver must ensure that this is not interpreted as an SE0.
+  // Since the 48MHz sample clock is 20.833ns period we will either miss this or
+  // sample it only once, so it will be covered by line_state=DT and the next
+  // sample will not be SE0 unless this was a real SE0 transition
+  // Note: if it is a real SE0 the differential rx could be doing anything
+
+  logic [2:0] line_state_qq, line_state_q, line_state_d;
+  logic [2:0] diff_state_q, diff_state_d;
+  logic [2:0] line_state_rx;
+  logic       use_se;
+
   localparam logic [2:0]  DT = 3'b100; // transition state
   localparam logic [2:0]  DJ = 3'b010; // J - idle line state
-  // localparam logic [2:0]  DK = 3'b001; // K - inverse of J
+  localparam logic [2:0]  DK = 3'b001; // K - inverse of J
   localparam logic [2:0] SE0 = 3'b000; // single-ended 0 - end of packet or detached
   // localparam logic [2:0] SE1 = 3'b011; // single-ended 1 - illegal
 
   // Mute the input if we're transmitting
-  logic [1:0] dpair;
+  logic [1:0] dpair, ddiff;
   always_comb begin : proc_dpair_mute
     if (tx_en_i) begin
       dpair = DJ[1:0]; // J
+      ddiff = DJ[1:0]; // J
     end else begin
-      dpair = (usb_se0_i) ? 2'b00 : {usb_d_i, ~usb_d_i};
+      dpair = {usb_dp_i, usb_dn_i};
+      ddiff = usb_d_i ? DJ[1:0] : DK[1:0]; // equiv to {usb_d_i, ~usb_d_i}
     end
   end
 
   always_ff @(posedge clk_i or negedge rst_ni) begin : proc_line_state_q
     if (!rst_ni) begin
       line_state_q <= SE0;
+      line_state_qq <= SE0;
+      diff_state_q <= SE0;
     end else begin
       if (link_reset_i) begin
         line_state_q <= SE0;
+        line_state_qq <= SE0;
+        diff_state_q <= SE0;
       end else begin
         line_state_q <= line_state_d;
+        line_state_qq <= line_state_q;
+        diff_state_q <= diff_state_d;
       end
     end
   end
@@ -116,6 +147,64 @@
     end
   end
 
+  always_comb begin : proc_diff_state_d
+    // Default assignment
+    diff_state_d = diff_state_q;
+
+    if (diff_state_q == DT) begin
+      // if we are in a transition state, then we can sample the diff input and
+      // move to the next corresponding line state
+      diff_state_d = {1'b0, ddiff};
+
+    end else begin
+      // if we are in a valid line state and the value of the diff input changes,
+      // then we need to move to the transition state
+      if (ddiff != diff_state_q[1:0]) begin
+        diff_state_d = DT;
+      end
+    end
+  end
+
+  // The received line state depends on how the receiver is configured:
+  // Single ended only: it is just the line_state_q that was captured
+  //
+  // Differential: recovered from the differential receiver (diff_state_q)
+  //               unless the single ended indicate SE0 when the differential
+  //               receiver could produce any value
+  //
+  // Transition where single ended happens to see SE0 looks like (driven by diff DT):
+  // line_state    D? DT D?...
+  // diff_state    Dx DT Dy         (expect Dy to be inverse of Dx since diff changed)
+  //
+  // Transition to SE0 when differential changes will look like:
+  // line_state    DT D? D? D? DT SE0 SE0... (DT is the first sample at SE0)
+  // diff_state    DT Dx Dx Dx DT ??  ??...  (diff saw transition as line went SE0)
+  //    --> out    DT Dx Dx Dx DT SE0 SE0    (if no transition then DT would be Dx and n=3)
+  // bit_phase      n  0  1  2  3  0   1     (n=3 unless there was a clock resync)
+  //
+  // Transition to SE0 when differential does not change will look like:
+  // line_state    DT D? D? D? DT SE0 SE0... (DT is the first sample at SE0)
+  // diff_state    DT Dx Dx Dx Dx ??  ??...  (diff no transition as line went SE0)
+  //    --> out    DT Dx Dx Dx Dx SE0 SE0    (if no transition then DT would be Dx and n=3)
+  // bit_phase      n  0  1  2  3  0   1     (n=3 unless there was a clock resync)
+  //
+  // Transition to SE0 when differential does not change and clock resync earlier:
+  // line_state    DT D? D? DT SE0 SE0 SE0... (DT is the first sample at SE0, should resync clock)
+  // diff_state    DT Dx Dx Dx Dx  ??  ??...  (diff no transition as line went SE0)
+  //    --> out    DT Dx Dx Dx SE0 SE0 SE0    (if no transition then DT would be Dx and n=3)
+  // bit_phase      n  0  1  2  3   0   1     (n=3 unless there was a clock resync)
+  //
+  // On the transition back from SE0 we want to generate a DT to resync the clock:
+  // the SE0 could have lasted a while, so bit_phase is unknown
+  // line_state    SE0 SE0 DT D? D? D?
+  // diff_state    ??  ??  ?? Dx Dx Dx
+  //   --> out     SE0 SE0 DT Dx Dx Dx
+  // bit_phase      ?   ?   ?  0  1  2
+
+  assign use_se = (line_state_q == SE0) || ((line_state_q == DT) && (line_state_qq == SE0));
+  assign line_state_rx = cfg_rx_differential_i ? (use_se ? line_state_q : diff_state_q) :
+                                                 line_state_q;
+
   ////////////////////
   // clock recovery //
   ////////////////////
@@ -137,7 +226,7 @@
   assign bit_strobe_o     = (bit_phase_q == 2'd2);
 
   // keep track of phase within each bit
-  assign bit_phase_d = (line_state_q == DT) ? 0 : bit_phase_q + 1;
+  assign bit_phase_d = (line_state_rx == DT) ? 0 : bit_phase_q + 1;
 
   always_ff @(posedge clk_i or negedge rst_ni) begin : proc_bit_phase_q
     if (!rst_ni) begin
@@ -193,7 +282,7 @@
   end
 
   // keep a history of the last two states on the line
-  assign line_history_d = line_state_valid ? {line_history_q[9:0], line_state_q[1:0]} :
+  assign line_history_d = line_state_valid ? {line_history_q[9:0], line_state_rx[1:0]} :
                                               line_history_q;
 
   always_ff @(posedge clk_i or negedge rst_ni) begin : proc_reg_pkt_line
@@ -211,6 +300,9 @@
     end
   end
 
+  // mask out jjj detection when transmitting (because rx is forced to J)
+  assign rx_se0_det_o = line_history_q[5:0] == 6'b000000; // three SE0s
+  assign rx_jjj_det_o = ~tx_en_i & (line_history_q[5:0] == 6'b101010); // three Js
 
   /////////////////
   // NRZI decode //
diff --git a/hw/ip/usbdev/data/usbdev.hjson b/hw/ip/usbdev/data/usbdev.hjson
index 67ee31a..831e265 100644
--- a/hw/ip/usbdev/data/usbdev.hjson
+++ b/hw/ip/usbdev/data/usbdev.hjson
@@ -209,14 +209,19 @@
               name: "suspend",
               desc: "Link suspended (constant idle for > 3 ms), was active before becoming suspended"
             },
-
+            { value: "5",
+              name: "active_nosof",
+              desc: "Link active but no SOF has been received since the last reset."
+            },
           ]
         }
         {
           bits: "15",
           name: "sense",
           desc: '''
-                Reflects the state of the sense pin. 1 indicates that the host is providing VBUS.
+                Reflects the state of the sense pin.
+		1 indicates that the host is providing VBUS.
+		Note that this bit always shows the state of the actual pin and does not take account of the override control.
                 '''
         }
         {
diff --git a/hw/ip/usbdev/rtl/usbdev.sv b/hw/ip/usbdev/rtl/usbdev.sv
index 08a5231..8e17389 100644
--- a/hw/ip/usbdev/rtl/usbdev.sv
+++ b/hw/ip/usbdev/rtl/usbdev.sv
@@ -141,13 +141,20 @@
 
 
   /////////////////////////////////
-  // USB IO after CDC & muxing   //
+  // USB RX after CDC & muxing   //
   /////////////////////////////////
   logic usb_rx_d;
-  logic usb_rx_se0;
+  logic usb_rx_dp;
+  logic usb_rx_dn;
+  /////////////////////////////////
+  // USB TX after CDC & muxing   //
+  /////////////////////////////////
   logic usb_tx_d;
   logic usb_tx_se0;
   logic usb_tx_oe;
+  /////////////////////////////////
+  // USB control pins after CDC  //
+  /////////////////////////////////
   logic usb_pwr_sense;
   logic usb_pullup_en;
 
@@ -471,7 +478,8 @@
 
     // Pins
     .usb_d_i              (usb_rx_d),
-    .usb_se0_i            (usb_rx_se0),
+    .usb_dp_i             (usb_rx_dp),
+    .usb_dn_i             (usb_rx_dn),
     .usb_oe_o             (usb_tx_oe),
     .usb_d_o              (usb_tx_d),
     .usb_se0_o            (usb_tx_se0),
@@ -518,6 +526,7 @@
     .ep_iso_i             (ep_iso), // cdc ok, quasi-static
     .cfg_eop_single_bit_i (reg2hw.phy_config.eop_single_bit.q), // cdc ok: quasi-static
     .tx_osc_test_mode_i   (reg2hw.phy_config.tx_osc_test_mode.q), // cdc ok: quasi-static
+    .cfg_rx_differential_i (reg2hw.phy_config.rx_differential_mode.q), // cdc ok: quasi-static
     .data_toggle_clear_i  (usb_data_toggle_clear),
 
     // status
@@ -944,7 +953,8 @@
 
     // Internal interface
     .usb_rx_d_o             (usb_rx_d),
-    .usb_rx_se0_o           (usb_rx_se0),
+    .usb_rx_dp_o            (usb_rx_dp),
+    .usb_rx_dn_o            (usb_rx_dn),
     .usb_tx_d_i             (usb_tx_d),
     .usb_tx_se0_i           (usb_tx_se0),
     .usb_tx_oe_i            (usb_tx_oe),
diff --git a/hw/ip/usbdev/rtl/usbdev_iomux.sv b/hw/ip/usbdev/rtl/usbdev_iomux.sv
index 2dffd0d..5c3ea20 100644
--- a/hw/ip/usbdev/rtl/usbdev_iomux.sv
+++ b/hw/ip/usbdev/rtl/usbdev_iomux.sv
@@ -40,12 +40,11 @@
 
   // Internal USB Interface (usb clk)
   output logic                          usb_rx_d_o,
-  output logic                          usb_rx_se0_o,
-
+  output logic                          usb_rx_dp_o,
+  output logic                          usb_rx_dn_o,
   input  logic                          usb_tx_d_i,
   input  logic                          usb_tx_se0_i,
   input  logic                          usb_tx_oe_i,
-
   output logic                          usb_pwr_sense_o,
   input  logic                          usb_pullup_en_i,
   input  logic                          usb_suspend_i
@@ -53,15 +52,16 @@
 
   logic async_pwr_sense, sys_usb_sense;
   logic cio_usb_dp, cio_usb_dn, cio_usb_d;
-  logic usb_rx_dp, usb_rx_dn, usb_rx_d;
   logic pinflip;
   logic unused_eop_single_bit;
+  logic unused_rx_differential_mode;
   logic unused_usb_ref_disable;
   logic unused_tx_osc_test_mode;
 
-  assign unused_eop_single_bit   = sys_reg2hw_config_i.eop_single_bit.q;
-  assign unused_usb_ref_disable  = sys_reg2hw_config_i.usb_ref_disable.q;
-  assign unused_tx_osc_test_mode = sys_reg2hw_config_i.tx_osc_test_mode.q;
+  assign unused_eop_single_bit       = sys_reg2hw_config_i.eop_single_bit.q;
+  assign unused_usb_ref_disable      = sys_reg2hw_config_i.usb_ref_disable.q;
+  assign unused_tx_osc_test_mode     = sys_reg2hw_config_i.tx_osc_test_mode.q;
+  assign unused_rx_differential_mode = sys_reg2hw_config_i.rx_differential_mode.q;
 
   //////////
   // CDCs //
@@ -113,11 +113,12 @@
 
     // The single-ended signals are only driven in single-ended mode.
     if (sys_reg2hw_config_i.tx_differential_mode.q) begin
-      // Differential TX mode
+      // Differential TX mode (physical IO takes d and se0)
+      // i.e. expect the "else" logic to be in the physical interface
       cio_usb_tx_mode_se_o   = 1'b0;
 
     end else begin
-      // Single-ended TX mode
+      // Single-ended TX mode (physical IO takes dp and dn)
       cio_usb_tx_mode_se_o   = 1'b1;
       if (usb_tx_se0_i) begin
         cio_usb_dp_o = 1'b0;
@@ -139,26 +140,10 @@
   // USB input pin mux //
   ///////////////////////
 
-  // Note that while transmitting, we fix the receive line to 1. If the receive line isn't fixed,
-  // we are trying to regenerate the bit clock from the bit clock we are regenerating, rather than
-  // just holding the phase.
   // D+/D- can be swapped based on a config register.
-  assign usb_rx_dp = usb_tx_oe_i ? 1'b1 : (pinflip ?  cio_usb_dn : cio_usb_dp);
-  assign usb_rx_dn = usb_tx_oe_i ? 1'b0 : (pinflip ?  cio_usb_dp : cio_usb_dn);
-  assign usb_rx_d  = usb_tx_oe_i ? 1'b1 : (pinflip ? ~cio_usb_d  : cio_usb_d);
-
-  always_comb begin : proc_diff_se_mux_in
-    usb_rx_se0_o = ~usb_rx_dp & ~usb_rx_dn;
-
-    if (sys_reg2hw_config_i.rx_differential_mode.q) begin
-      // Differential RX mode
-      usb_rx_d_o = usb_rx_d;
-
-    end else begin
-      // Single-ended RX mode
-      usb_rx_d_o = usb_rx_dp; // SE1 is interpreted as differential 1
-    end
-  end
+  assign usb_rx_dp_o = pinflip ?  cio_usb_dn : cio_usb_dp;
+  assign usb_rx_dn_o = pinflip ?  cio_usb_dp : cio_usb_dn;
+  assign usb_rx_d_o  = pinflip ? ~cio_usb_d  : cio_usb_d;
 
   // Power sense mux
   always_comb begin : proc_mux_pwr_sense
diff --git a/hw/ip/usbdev/rtl/usbdev_linkstate.sv b/hw/ip/usbdev/rtl/usbdev_linkstate.sv
index d4dea2d..acc151d 100644
--- a/hw/ip/usbdev/rtl/usbdev_linkstate.sv
+++ b/hw/ip/usbdev/rtl/usbdev_linkstate.sv
@@ -10,8 +10,8 @@
   input  logic rst_ni,
   input  logic us_tick_i,
   input  logic usb_sense_i,
-  input  logic usb_rx_d_i,
-  input  logic usb_rx_se0_i,
+  input  logic rx_se0_det_i,
+  input  logic rx_jjj_det_i,
   input  logic sof_valid_i,
   output logic link_disconnect_o,  // level
   output logic link_connect_o,     // level
@@ -35,6 +35,7 @@
     LinkPoweredSuspend = 2,
     // Active states
     LinkActive = 3,
+    LinkActiveNoSOF = 5,
     LinkSuspend = 4
   } link_state_e;
 
@@ -51,8 +52,7 @@
   } link_inac_state_e;
 
   link_state_e  link_state_d, link_state_q;
-  logic         line_se0_raw, line_idle_raw;
-  logic         see_se0, see_idle, see_pwr_sense;
+  logic         see_pwr_sense;
 
   // Reset FSM
   logic [2:0]      link_rst_timer_d, link_rst_timer_q;
@@ -75,32 +75,11 @@
   assign link_connect_o    = (link_state_q != LinkDisconnect);
   assign link_suspend_o    = (link_state_q == LinkSuspend ||
     link_state_q == LinkPoweredSuspend);
-  assign link_active_o     = (link_state_q == LinkActive);
+  assign link_active_o     = (link_state_q == LinkActive) ||
+    (link_state_q == LinkActiveNoSOF);
   // Link state is stable, so we can output it to the register
   assign link_state_o      =  link_state_q;
 
-  assign line_se0_raw = usb_rx_se0_i;
-  assign line_idle_raw = usb_rx_d_i && !usb_rx_se0_i; // same as J
-
-  // four ticks is a bit time
-  // Could completely filter out 2-cycle EOP SE0 here but
-  // does not seem needed
-  prim_filter #(.Cycles(6)) filter_se0 (
-    .clk_i    (clk_48mhz_i),
-    .rst_ni   (rst_ni),
-    .enable_i (1'b1),
-    .filter_i (line_se0_raw),
-    .filter_o (see_se0)
-  );
-
-  prim_filter #(.Cycles(6)) filter_idle (
-    .clk_i    (clk_48mhz_i),
-    .rst_ni   (rst_ni),
-    .enable_i (1'b1),
-    .filter_i (line_idle_raw),
-    .filter_o (see_idle)
-  );
-
   prim_filter #(.Cycles(6)) filter_pwr_sense (
     .clk_i    (clk_48mhz_i),
     .rst_ni   (rst_ni),
@@ -110,13 +89,14 @@
   );
 
   // Simple events
-  assign ev_bus_active = !see_idle;
+  assign ev_bus_active = !rx_jjj_det_i;
+
+  assign monitor_inac = see_pwr_sense ? ((link_state_q == LinkPowered) | link_active_o) :
+                        1'b0;
 
   always_comb begin
     link_state_d = link_state_q;
     link_resume_o = 0;
-    monitor_inac = see_pwr_sense ? ((link_state_q == LinkPowered) | (link_state_q == LinkActive)) :
-                                   1'b0;
 
     // If VBUS ever goes away the link has disconnected
     if (!see_pwr_sense) begin
@@ -132,7 +112,7 @@
 
         LinkPowered: begin
           if (ev_reset) begin
-            link_state_d = LinkActive;
+            link_state_d = LinkActiveNoSOF;
           end else if (ev_bus_inactive) begin
             link_state_d = LinkPoweredSuspend;
           end
@@ -140,22 +120,39 @@
 
         LinkPoweredSuspend: begin
           if (ev_reset) begin
-            link_state_d = LinkActive;
+            link_state_d = LinkActiveNoSOF;
           end else if (ev_bus_active) begin
             link_resume_o = 1;
             link_state_d  = LinkPowered;
           end
         end
 
+        // Active but not yet seen a frame
+        // One reason for getting stuck here is that the host thinks it is a LS link,
+        // which could happen if the pinflip setting does not match the actual pins.
+        // Another is that the SI is bad, so good data is not recovered from the link.
+        LinkActiveNoSOF: begin
+          if (ev_bus_inactive) begin
+            link_state_d = LinkSuspend;
+          end else if (sof_valid_i) begin
+            link_state_d = LinkActive;
+          end
+        end
+
         // Active (USB spec: Default / Address / Configured)
         LinkActive: begin
           if (ev_bus_inactive) begin
             link_state_d = LinkSuspend;
+          end else if (ev_reset) begin
+            link_state_d = LinkActiveNoSOF;
           end
         end
 
         LinkSuspend: begin
-          if (ev_reset || ev_bus_active) begin
+          if (ev_reset) begin
+            link_resume_o = 1;
+            link_state_d  = LinkActiveNoSOF;
+          end else if (ev_bus_active) begin
             link_resume_o = 1;
             link_state_d  = LinkActive;
           end
@@ -191,7 +188,7 @@
     unique case (link_rst_state_q)
       // No reset signal detected
       NoRst: begin
-        if (see_se0) begin
+        if (rx_se0_det_i) begin
           link_rst_state_d = RstCnt;
           link_rst_timer_d = 0;
         end
@@ -199,7 +196,7 @@
 
       // Reset signal detected -> counting
       RstCnt: begin
-        if (!see_se0) begin
+        if (!rx_se0_det_i) begin
           link_rst_state_d = NoRst;
         end else begin
           if (us_tick_i) begin
@@ -214,7 +211,7 @@
 
       // Detected reset -> wait for falling edge
       RstPend: begin
-        if (!see_se0) begin
+        if (!rx_se0_det_i) begin
           link_rst_state_d = NoRst;
           ev_reset = 1'b1;
         end
@@ -251,14 +248,14 @@
       // Active or disabled
       Active: begin
         link_inac_timer_d = 0;
-        if (see_idle && monitor_inac) begin
+        if (!ev_bus_active && monitor_inac) begin
           link_inac_state_d = InactCnt;
         end
       end
 
       // Got an inactivity signal -> count duration
       InactCnt: begin
-        if (!see_idle || !monitor_inac) begin
+        if (ev_bus_active || !monitor_inac) begin
           link_inac_state_d  = Active;
         end else if (us_tick_i) begin
           if (link_inac_timer_q == SUSPEND_TIMEOUT) begin
@@ -272,7 +269,7 @@
 
       // Counter expired & event sent, wait here
       InactPend: begin
-        if (!see_idle || !monitor_inac) begin
+        if (ev_bus_active || !monitor_inac) begin
           link_inac_state_d  = Active;
         end
       end
diff --git a/hw/ip/usbdev/rtl/usbdev_usbif.sv b/hw/ip/usbdev/rtl/usbdev_usbif.sv
index 66b7da6..39789d4 100644
--- a/hw/ip/usbdev/rtl/usbdev_usbif.sv
+++ b/hw/ip/usbdev/rtl/usbdev_usbif.sv
@@ -24,7 +24,8 @@
 
   // Pins (synchronous)
   input  logic                     usb_d_i,
-  input  logic                     usb_se0_i,
+  input  logic                     usb_dp_i,
+  input  logic                     usb_dn_i,
 
   output logic                     usb_d_o,
   output logic                     usb_se0_o,
@@ -72,6 +73,7 @@
   output logic                     clr_devaddr_o,
   input  logic [NEndpoints-1:0]    ep_iso_i,
   input  logic                     cfg_eop_single_bit_i, // 1: detect a single SE0 bit as EOP
+  input  logic                     cfg_rx_differential_i, // 1: use differential rx data on usb_d_i
   input  logic                     tx_osc_test_mode_i, // Oscillator test mode: constant JK output
   input  logic [NEndpoints-1:0]    data_toggle_clear_i, // Clear the data toggles for an EP
 
@@ -258,6 +260,7 @@
   assign set_sent_o = in_ep_acked;
 
   logic [10:0]     frame_index_raw;
+  logic            rx_se0_det, rx_jjj_det;
 
   usb_fs_nb_pe #(
     .NumOutEps      (NEndpoints),
@@ -269,11 +272,13 @@
     .link_reset_i          (link_reset),
 
     .cfg_eop_single_bit_i  (cfg_eop_single_bit_i),
+    .cfg_rx_differential_i (cfg_rx_differential_i),
     .tx_osc_test_mode_i    (tx_osc_test_mode_i),
     .data_toggle_clear_i   (data_toggle_clear_i),
 
     .usb_d_i               (usb_d_i),
-    .usb_se0_i             (usb_se0_i),
+    .usb_dp_i              (usb_dp_i),
+    .usb_dn_i              (usb_dn_i),
     .usb_d_o               (usb_d_o),
     .usb_se0_o             (usb_se0_o),
     .usb_oe_o              (usb_oe_o),
@@ -306,6 +311,10 @@
     .in_ep_data_done_i     (in_ep_data_done),
     .in_ep_iso_i           (ep_iso_i),
 
+    // rx status
+    .rx_se0_det_o          (rx_se0_det),
+    .rx_jjj_det_o          (rx_jjj_det),
+
     // error signals
     .rx_crc_err_o          (rx_crc_err_o),
     .rx_pid_err_o          (rx_pid_err_o),
@@ -349,8 +358,8 @@
     .rst_ni            (rst_ni),
     .us_tick_i         (us_tick),
     .usb_sense_i       (usb_sense_i),
-    .usb_rx_d_i        (usb_d_i),
-    .usb_rx_se0_i      (usb_se0_i),
+    .rx_se0_det_i      (rx_se0_det),
+    .rx_jjj_det_i      (rx_jjj_det),
     .sof_valid_i       (sof_valid),
     .link_disconnect_o (link_disconnect_o),
     .link_connect_o    (link_connect_o),
diff --git a/hw/top_earlgrey/data/pins_nexysvideo.xdc b/hw/top_earlgrey/data/pins_nexysvideo.xdc
index 67c28f2..f2123e5 100644
--- a/hw/top_earlgrey/data/pins_nexysvideo.xdc
+++ b/hw/top_earlgrey/data/pins_nexysvideo.xdc
@@ -130,12 +130,13 @@
 set_property -dict { PACKAGE_PIN V8    IOSTANDARD LVCMOS33 DRIVE 8 SLEW FAST } [get_ports { IO_USB_DN0 }]; #IO_L21N_T3_DQS_34 Sch=jb_n[1]
 set_property -dict { PACKAGE_PIN V7    IOSTANDARD LVCMOS33 } [get_ports { IO_USB_DPPULLUP0 }]; #IO_L19P_T3_34 Sch=jb_p[2]
 set_property -dict { PACKAGE_PIN W7    IOSTANDARD LVCMOS33 } [get_ports { IO_USB_SENSE0 }]; #IO_L19N_T3_VREF_34 Sch=jb_n[2]
+set_property -dict { PACKAGE_PIN Y8    IOSTANDARD LVCMOS33 } [get_ports { IO_USB_DNPULLUP0 }]; #IO_L23P_T3_34 Sch=jb_p[4]
+
+## Pmod header JB UNUSED pins (used for testing 2 USB interfaces)
 #set_property -dict { PACKAGE_PIN W9    IOSTANDARD LVCMOS33 DRIVE 8 SLEW FAST } [get_ports { IO_USB_DP1 }]; #IO_L24P_T3_34 Sch=jb_p[3]
 #set_property -dict { PACKAGE_PIN Y9    IOSTANDARD LVCMOS33 DRIVE 8 SLEW FAST } [get_ports { IO_USB_DN1 }]; #IO_L24N_T3_34 Sch=jb_n[3]
-set_property -dict { PACKAGE_PIN Y8    IOSTANDARD LVCMOS33 } [get_ports { IO_USB_DNPULLUP0 }]; #IO_L23P_T3_34 Sch=jb_p[4]
 #set_property -dict { PACKAGE_PIN Y7    IOSTANDARD LVCMOS33 } [get_ports { IO_USB_SENSE1 }]; #IO_L23N_T3_34 Sch=jb_n[4]
 
-
 ## Pmod header JC
 #set_property -dict { PACKAGE_PIN Y6    IOSTANDARD LVCMOS33 } [get_ports { IO_SDCK   }]; #IO_L18P_T2_34 Sch=jc_p[1]
 #set_property -dict { PACKAGE_PIN AA6   IOSTANDARD LVCMOS33 } [get_ports { IO_SDCSB  }]; #IO_L18N_T2_34 Sch=jc_n[1]
diff --git a/hw/top_earlgrey/rtl/top_earlgrey_nexysvideo.sv b/hw/top_earlgrey/rtl/top_earlgrey_nexysvideo.sv
index 9160cd1..e82b41c 100644
--- a/hw/top_earlgrey/rtl/top_earlgrey_nexysvideo.sv
+++ b/hw/top_earlgrey/rtl/top_earlgrey_nexysvideo.sv
@@ -60,9 +60,9 @@
   logic [padctrl_reg_pkg::NMioPads-1:0] mio_out_core, mio_out_padring;
   logic [padctrl_reg_pkg::NMioPads-1:0] mio_oe_core, mio_oe_padring;
   logic [padctrl_reg_pkg::NMioPads-1:0] mio_in_core, mio_in_padring;
-  logic [padctrl_reg_pkg::NDioPads-1:0] dio_out_core, dio_out_padring;
-  logic [padctrl_reg_pkg::NDioPads-1:0] dio_oe_core, dio_oe_padring;
-  logic [padctrl_reg_pkg::NDioPads-1:0] dio_in_core, dio_in_padring;
+  logic [padctrl_reg_pkg::NDioPads-1:0] dio_out_core, dio_out_umux, dio_out_padring;
+  logic [padctrl_reg_pkg::NDioPads-1:0] dio_oe_core, dio_oe_umux, dio_oe_padring;
+  logic [padctrl_reg_pkg::NDioPads-1:0] dio_in_core, dio_in_umux, dio_in_padring;
 
   padring #(
     // MIOs 31:20 are currently not
@@ -177,16 +177,68 @@
     .jtag_srst_no ( jtag_srst_n     ),
     .jtag_tdi_o   ( jtag_tdi        ),
     .jtag_tdo_i   ( jtag_tdo        ),
-    // To core side
-    .out_core_i   ( {dio_out_core, mio_out_core} ),
-    .oe_core_i    ( {dio_oe_core,  mio_oe_core}  ),
-    .in_core_o    ( {dio_in_core,  mio_in_core}  ),
+    // To core side via usbmux for DIOs
+    .out_core_i   ( {dio_out_umux, mio_out_core} ),
+    .oe_core_i    ( {dio_oe_umux,  mio_oe_core}  ),
+    .in_core_o    ( {dio_in_umux,  mio_in_core}  ),
     // To padring side
     .out_padring_o ( {dio_out_padring, mio_out_padring} ),
-    .oe_padring_o  ( {dio_oe_padring , mio_oe_padring } ),
-    .in_padring_i  ( {dio_in_padring , mio_in_padring } )
+    .oe_padring_o  ( {dio_oe_padring, mio_oe_padring } ),
+    .in_padring_i  ( {dio_in_padring, mio_in_padring } )
   );
 
+  // Software can enable the pinflip feature inside usbdev.
+  // The example hello_usbdev does this based on GPIO0 (a switch on the board)
+  //
+  // Here, we use the state of the DN pullup to effectively undo the
+  // swapping such that the PCB always sees the unflipped D+/D-. We
+  // could do the same inside the .xdc file but then two FPGA
+  // bitstreams would be needed for testing.
+  //
+  // dio_in/out/oe map is: PADS <- _padring <- JTAG mux -> _umux -> USB mux -> _core
+  localparam int DioIdxUsbDn0 = top_earlgrey_pkg::TopEarlgreyDioPinUsbdevDn;
+  localparam int DioIdxUsbDp0 = top_earlgrey_pkg::TopEarlgreyDioPinUsbdevDp;
+  localparam int DioIdxUsbDnPullup0 = top_earlgrey_pkg::TopEarlgreyDioPinUsbdevDnPullup;
+  localparam int DioIdxUsbDpPullup0 = top_earlgrey_pkg::TopEarlgreyDioPinUsbdevDpPullup;
+
+  // The output enable for IO_USB_DNPULLUP0 is used to decide whether we need to undo the swapping.
+  logic undo_swap;
+  assign undo_swap = dio_oe_core[DioIdxUsbDnPullup0];
+
+  for (genvar i = 0; i < padctrl_reg_pkg::NDioPads; i++) begin : gen_dio
+    if (i == DioIdxUsbDn0) begin
+      assign dio_out_umux[i] = undo_swap ? dio_out_core[DioIdxUsbDp0] :
+                                           dio_out_core[DioIdxUsbDn0];
+      assign dio_oe_umux[i]  = undo_swap ? dio_oe_core[DioIdxUsbDp0] :
+                                           dio_oe_core[DioIdxUsbDn0];
+      assign dio_in_core[i]  = undo_swap ? dio_in_umux[DioIdxUsbDp0] :
+                                           dio_in_umux[DioIdxUsbDn0];
+    end else if (i == DioIdxUsbDp0) begin
+      assign dio_out_umux[i] = undo_swap ? dio_out_core[DioIdxUsbDn0] :
+                                           dio_out_core[DioIdxUsbDp0];
+      assign dio_oe_umux[i]  = undo_swap ? dio_oe_core[DioIdxUsbDn0] :
+                                           dio_oe_core[DioIdxUsbDp0];
+      assign dio_in_core[i]  = undo_swap ? dio_in_umux[DioIdxUsbDn0] :
+                                           dio_in_umux[DioIdxUsbDp0];
+    end else if (i == DioIdxUsbDnPullup0) begin
+      assign dio_out_umux[i] = undo_swap ? dio_out_core[DioIdxUsbDpPullup0] :
+                                           dio_out_core[DioIdxUsbDnPullup0];
+      assign dio_oe_umux[i]  = undo_swap ? dio_oe_core[DioIdxUsbDpPullup0] :
+                                           dio_oe_core[DioIdxUsbDnPullup0];
+      assign dio_in_core[i]  = dio_in_umux[i];
+    end else if (i == DioIdxUsbDpPullup0) begin
+      assign dio_out_umux[i] = undo_swap ? dio_out_core[DioIdxUsbDnPullup0] :
+                                           dio_out_core[DioIdxUsbDpPullup0];
+      assign dio_oe_umux[i]  = undo_swap ? dio_oe_core[DioIdxUsbDnPullup0] :
+                                           dio_oe_core[DioIdxUsbDpPullup0];
+      assign dio_in_core[i]  = dio_in_umux[i];
+    end else begin
+      assign dio_out_umux[i] = dio_out_core[i];
+      assign dio_oe_umux[i]  = dio_oe_core[i];
+      assign dio_in_core[i]  = dio_in_umux[i];
+    end
+  end
+
   ////////////////////////////////
   // JTAG clock buffer for FPGA //
   ////////////////////////////////
diff --git a/hw/top_earlgrey/rtl/top_earlgrey_verilator.sv b/hw/top_earlgrey/rtl/top_earlgrey_verilator.sv
index 5cbbe11..44490dd 100644
--- a/hw/top_earlgrey/rtl/top_earlgrey_verilator.sv
+++ b/hw/top_earlgrey/rtl/top_earlgrey_verilator.sv
@@ -204,37 +204,33 @@
 
   // USB DPI
   usbdpi u_usbdpi (
-    .clk_i         (clk_i),
-    .rst_ni        (rst_ni),
-    .clk_48MHz_i   (clk_i),
-    .sense_p2d     (cio_usbdev_sense_p2d),
-    .pullup_d2p    (cio_usbdev_dp_pullup_d2p),
-    .pullup_en_d2p (cio_usbdev_dp_pullup_en_d2p),
-    .dp_p2d        (cio_usbdev_dp_p2d),
-    .dp_d2p        (cio_usbdev_dp_d2p),
-    .dp_en_d2p     (cio_usbdev_dp_en_d2p),
-    .dn_p2d        (cio_usbdev_dn_p2d),
-    .dn_d2p        (cio_usbdev_dn_d2p),
-    .dn_en_d2p     (cio_usbdev_dn_en_d2p)
+    .clk_i           (clk_i),
+    .rst_ni          (rst_ni),
+    .clk_48MHz_i     (clk_i),
+    .sense_p2d       (cio_usbdev_sense_p2d),
+    .pullupdp_d2p    (cio_usbdev_dp_pullup_d2p),
+    .pullupdp_en_d2p (cio_usbdev_dp_pullup_en_d2p),
+    .pullupdn_d2p    (cio_usbdev_dn_pullup_d2p),
+    .pullupdn_en_d2p (cio_usbdev_dn_pullup_en_d2p),
+    .dp_p2d          (cio_usbdev_dp_p2d),
+    .dp_d2p          (cio_usbdev_dp_d2p),
+    .dp_en_d2p       (cio_usbdev_dp_en_d2p),
+    .dn_p2d          (cio_usbdev_dn_p2d),
+    .dn_d2p          (cio_usbdev_dn_d2p),
+    .dn_en_d2p       (cio_usbdev_dn_en_d2p),
+    .d_p2d           (cio_usbdev_d_p2d),
+    .d_d2p           (cio_usbdev_d_d2p),
+    .d_en_d2p        (cio_usbdev_d_en_d2p),
+    .se0_d2p         (cio_usbdev_se0_d2p),
+    .se0_en_d2p      (cio_usbdev_se0_en_d2p),
+    .txmode_d2p      (cio_usbdev_tx_mode_se_d2p),
+    .txmode_en_d2p   (cio_usbdev_tx_mode_se_en_d2p)
   );
 
   // Tie off unused signals.
-  logic unused_cio_usbdev_se0_d2p, unused_cio_usbdev_se0_en_d2p;
-  logic unused_cio_usbdev_dn_pullup_d2p, unused_cio_usbdev_dn_pullup_en_d2p;
-  logic unused_cio_usbdev_tx_mode_se_d2p, unused_cio_usbdev_tx_mode_se_en_d2p;
   logic unused_cio_usbdev_suspend_d2p, unused_cio_usbdev_suspend_en_d2p;
-  logic unused_cio_usbdev_d_d2p, unused_cio_usbdev_d_en_d2p;
-  assign unused_cio_usbdev_se0_d2p = cio_usbdev_se0_d2p;
-  assign unused_cio_usbdev_se0_en_d2p = cio_usbdev_se0_en_d2p;
-  assign unused_cio_usbdev_dn_pullup_d2p = cio_usbdev_dn_pullup_d2p;
-  assign unused_cio_usbdev_dn_pullup_en_d2p = cio_usbdev_dn_pullup_en_d2p;
-  assign unused_cio_usbdev_tx_mode_se_d2p = cio_usbdev_tx_mode_se_d2p;
-  assign unused_cio_usbdev_tx_mode_se_en_d2p = cio_usbdev_tx_mode_se_en_d2p;
   assign unused_cio_usbdev_suspend_d2p = cio_usbdev_suspend_d2p;
   assign unused_cio_usbdev_suspend_en_d2p = cio_usbdev_suspend_en_d2p;
-  assign cio_usbdev_d_p2d = 1'b0;
-  assign unused_cio_usbdev_d_d2p = cio_usbdev_d_d2p;
-  assign unused_cio_usbdev_d_en_d2p = cio_usbdev_d_en_d2p;
 
   // monitor for termination
 `ifndef END_MON_PATH
diff --git a/hw/top_earlgrey/util/opentitan_earlgrey_usbdev_pin_config_sim.sh b/hw/top_earlgrey/util/opentitan_earlgrey_usbdev_pin_config_sim.sh
new file mode 100755
index 0000000..e1ede15
--- /dev/null
+++ b/hw/top_earlgrey/util/opentitan_earlgrey_usbdev_pin_config_sim.sh
@@ -0,0 +1,70 @@
+#!/bin/bash
+# Copyright lowRISC contributors.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+
+# Simulator executable (relative path, resolved against the current directory)
+VERILATOR=build/lowrisc_systems_top_earlgrey_verilator_0.1/sim-verilator/Vtop_earlgrey_verilator
+
+# Code to load into ROM and flash through the simulator's --meminit options
+ROMCODE=build-bin/sw/device/boot_rom/boot_rom_sim_verilator.elf
+FLASH=build-bin/sw/device/examples/hello_usbdev/hello_usbdev_sim_verilator.elf
+
+# Where simulator output (usb0.log, uart0.log) or control fifos (gpio0-*) are put
+VFILE_DIR=.
+
+# How long to simulate (value passed to the simulator's -c option)
+SIM_CYCLES=700000
+
+# Expected output of the no-flip single-ended run
+EXPECT_USB=hw/top_earlgrey/util/opentitan_earlgrey_usbdev_expected-usb.log
+EXPECT_UART=hw/top_earlgrey/util/opentitan_earlgrey_usbdev_expected-uart.log
+
+# diff -I patterns for lines expected to differ between noflip se and the other runs
+IGNORE_USB="-I Pullup.change"
+IGNORE_UART="-I PHY.settings"
+
+echo "Simulation with normal pins, singleended"
+$VERILATOR --meminit=rom,$ROMCODE --meminit=flash,$FLASH -c $SIM_CYCLES &
+sleep 1
+echo 'l01 l00' > $VFILE_DIR/gpio0-write && cat $VFILE_DIR/gpio0-read
+cp $VFILE_DIR/usb0.log $VFILE_DIR/usb-noflip-se.log
+cp $VFILE_DIR/uart0.log $VFILE_DIR/uart-noflip-se.log
+
+
+echo "Simulation with flipped pins, singleended"
+$VERILATOR --meminit=rom,$ROMCODE --meminit=flash,$FLASH -c $SIM_CYCLES &
+sleep 1
+echo 'l01 h00' > $VFILE_DIR/gpio0-write && cat $VFILE_DIR/gpio0-read
+cp $VFILE_DIR/usb0.log $VFILE_DIR/usb-flip-se.log
+cp $VFILE_DIR/uart0.log $VFILE_DIR/uart-flip-se.log
+
+echo "Simulation with normal pins, differential"
+$VERILATOR --meminit=rom,$ROMCODE --meminit=flash,$FLASH -c $SIM_CYCLES &
+sleep 1
+echo 'h01 l00' > $VFILE_DIR/gpio0-write && cat $VFILE_DIR/gpio0-read
+cp $VFILE_DIR/usb0.log $VFILE_DIR/usb-noflip-diff.log
+cp $VFILE_DIR/uart0.log $VFILE_DIR/uart-noflip-diff.log
+
+echo "Simulation with flipped pins, differential"
+$VERILATOR --meminit=rom,$ROMCODE --meminit=flash,$FLASH -c $SIM_CYCLES &
+sleep 1
+echo 'h01 h00' > $VFILE_DIR/gpio0-write && cat $VFILE_DIR/gpio0-read
+cp $VFILE_DIR/usb0.log $VFILE_DIR/usb-flip-diff.log
+cp $VFILE_DIR/uart0.log $VFILE_DIR/uart-flip-diff.log
+
+echo "Check No Flip Single Ended against expected logs"
+diff $VFILE_DIR/usb-noflip-se.log $EXPECT_USB
+diff $VFILE_DIR/uart-noflip-se.log $EXPECT_UART
+
+echo "Check Flipped Single Ended against No Flip Single Ended"
+diff $IGNORE_USB $VFILE_DIR/usb-flip-se.log $VFILE_DIR/usb-noflip-se.log
+diff $IGNORE_UART $VFILE_DIR/uart-flip-se.log $VFILE_DIR/uart-noflip-se.log
+
+echo "Check No Flip differential against No Flip Single Ended"
+diff $IGNORE_USB $VFILE_DIR/usb-noflip-diff.log $VFILE_DIR/usb-noflip-se.log
+diff $IGNORE_UART $VFILE_DIR/uart-noflip-diff.log $VFILE_DIR/uart-noflip-se.log
+
+echo "Check Flipped differential against No Flip Single Ended"
+diff $IGNORE_USB $VFILE_DIR/usb-flip-diff.log $VFILE_DIR/usb-noflip-se.log
+diff $IGNORE_UART $VFILE_DIR/uart-flip-diff.log $VFILE_DIR/uart-noflip-se.log
diff --git a/sw/device/examples/hello_usbdev/hello_usbdev.c b/sw/device/examples/hello_usbdev/hello_usbdev.c
index 3a5ff5f..fcacfef 100644
--- a/sw/device/examples/hello_usbdev/hello_usbdev.c
+++ b/sw/device/examples/hello_usbdev/hello_usbdev.c
@@ -84,6 +84,25 @@
   ++usb_chars_recved_total;
 }
 
+/**
+ * Send a string over a USB simple serial endpoint.
+ *
+ * Hands each character before the terminating zero, in order, to
+ * usb_simpleserial_send_byte(), which will flush the endpoint if needed.
+ *
+ * @param string Zero terminated string to send.
+ * @param ss_ctx Pointer to simple serial endpoint context to send through.
+ */
+static void usb_send_str(const char *string, usb_ss_ctx_t *ss_ctx) {
+  for (const char *cursor = string; *cursor != 0; ++cursor) {
+    usb_simpleserial_send_byte(ss_ctx, *cursor);
+  }
+}
+
+// These GPIO bits control USB PHY configuration
+static const uint32_t kPinflipMask = 1;
+static const uint32_t kDiffMask = 2;
+
 int main(int argc, char **argv) {
   CHECK(dif_uart_init(
             (dif_uart_params_t){
@@ -131,7 +150,13 @@
   // Call `usbdev_init` here so that DPI will not start until the
   // simulation has finished all of the printing, which takes a while
   // if `--trace` was passed in.
-  usbdev_init(&usbdev);
+  uint32_t gpio_state;
+  CHECK(dif_gpio_read_all(&gpio, &gpio_state) == kDifGpioOk);
+  bool pinflip = gpio_state & kPinflipMask ? true : false;
+  bool differential = gpio_state & kDiffMask ? true : false;
+  LOG_INFO("PHY settings: pinflip=%d differential=%d", pinflip, differential);
+  usbdev_init(&usbdev, pinflip, differential, differential);
+
   usb_controlep_init(&usbdev_control, &usbdev, 0, config_descriptors,
                      sizeof(config_descriptors));
   usb_simpleserial_init(&simple_serial0, &usbdev, 1, usb_receipt_callback_0);
@@ -140,7 +165,7 @@
   CHECK(dif_spi_device_send(&spi, "SPI!", 4, /*bytes_sent=*/NULL) ==
         kDifSpiDeviceOk);
 
-  uint32_t gpio_state = 0;
+  bool say_hello = true;
   bool pass_signaled = false;
   while (true) {
     usbdev_poll(&usbdev);
@@ -170,7 +195,10 @@
         usb_simpleserial_send_byte(&simple_serial1, rcv_char + 1);
       }
     }
-
+    if (say_hello && usb_chars_recved_total > 2) {
+      usb_send_str("Hello USB World!!!!", &simple_serial0);
+      say_hello = false;
+    }
     // Signal that the simulation succeeded.
     if (usb_chars_recved_total >= kExpectedUsbCharsRecved && !pass_signaled) {
       LOG_INFO("PASS!");
diff --git a/sw/device/lib/usbdev.c b/sw/device/lib/usbdev.c
index 9eb7dc5..0341644 100644
--- a/sw/device/lib/usbdev.c
+++ b/sw/device/lib/usbdev.c
@@ -255,7 +255,7 @@
   }
 }
 
-void usbdev_init(usbdev_ctx_t *ctx) {
+void usbdev_init(usbdev_ctx_t *ctx, bool pinflip, bool diff_rx, bool diff_tx) {
   // setup context
   for (int i = 0; i < NUM_ENDPOINTS; i++) {
     usbdev_endpoint_setup(ctx, i, 0, NULL, NULL, NULL, NULL, NULL);
@@ -273,5 +273,11 @@
   REG32(USBDEV_RXENABLE_SETUP()) = (1 << USBDEV_RXENABLE_SETUP_SETUP_0);
   REG32(USBDEV_RXENABLE_OUT()) = (1 << USBDEV_RXENABLE_OUT_OUT_0);
 
+  uint32_t phy_config = (pinflip << USBDEV_PHY_CONFIG_PINFLIP) |
+                        (diff_rx << USBDEV_PHY_CONFIG_RX_DIFFERENTIAL_MODE) |
+                        (diff_tx << USBDEV_PHY_CONFIG_TX_DIFFERENTIAL_MODE) |
+                        (1 << USBDEV_PHY_CONFIG_EOP_SINGLE_BIT);
+  REG32(USBDEV_PHY_CONFIG()) = phy_config;
+
   REG32(USBDEV_USBCTRL()) = (1 << USBDEV_USBCTRL_ENABLE);
 }
diff --git a/sw/device/lib/usbdev.h b/sw/device/lib/usbdev.h
index 4e4c19a..7c0f5b2 100644
--- a/sw/device/lib/usbdev.h
+++ b/sw/device/lib/usbdev.h
@@ -5,6 +5,7 @@
 #ifndef OPENTITAN_SW_DEVICE_LIB_USBDEV_H_
 #define OPENTITAN_SW_DEVICE_LIB_USBDEV_H_
 
+#include <stdbool.h>
 #include <stddef.h>
 #include <stdint.h>
 
@@ -221,8 +222,11 @@
  * Initialize the usbdev interface
  *
  * @param ctx uninitialized usbdev context pointer
+ * @param pinflip boolean to indicate if PHY should be configured for D+/D- flip
+ * @param diff_rx boolean to indicate if PHY uses differential RX
+ * @param diff_tx boolean to indicate if PHY uses differential TX
  */
-void usbdev_init(usbdev_ctx_t *ctx);
+void usbdev_init(usbdev_ctx_t *ctx, bool pinflip, bool diff_rx, bool diff_tx);
 
 // Used for tracing what is going on. This may impact timing which is critical
 // when simulating with the USB DPI module.
diff --git a/sw/device/tests/usbdev_test.c b/sw/device/tests/usbdev_test.c
index 409bb3e..d4ef4e1 100644
--- a/sw/device/tests/usbdev_test.c
+++ b/sw/device/tests/usbdev_test.c
@@ -89,7 +89,8 @@
   // Call `usbdev_init` here so that DPI will not start until the
   // simulation has finished all of the printing, which takes a while
   // if `--trace` was passed in.
-  usbdev_init(&usbdev);
+  usbdev_init(&usbdev, /* pinflip= */ false, /* diff_rx= */ false,
+              /* diff_tx= */ false);
   usb_controlep_init(&usbdev_control, &usbdev, 0, config_descriptors,
                      sizeof(config_descriptors));
   usb_simpleserial_init(&simple_serial, &usbdev, 1, usb_receipt_callback);