Switch addition to vdmulh, fix rounding

- Now that the vdmulh bug has been corrected in sim, we can use vdmulh
  instead of the vmulh + vmul pair
- Enable rounding during the shift in our rescale macro; this fixes
  off-by-one errors seen in Soundstream's use of Add.
- clang-format the rescale macro

Change-Id: I01c11bf19a475a6768fab74cde3a4b6ec64d8c14
diff --git a/tflm/opt/util.h b/tflm/opt/util.h
index 8f9d079..1560c32 100644
--- a/tflm/opt/util.h
+++ b/tflm/opt/util.h
@@ -12,12 +12,11 @@
 #define RIGHT_SHIFT(_shift) -std::min(_shift, 0L)
 
 #define rescale_internal(Vd, Vs, mult, shift, offset, m) \
-  do { \
-    int32_t _shift = RIGHT_SHIFT(shift); \
-    vmulh_w_r_vx##m(Vd, Vs, mult); \
-    vmul_w_vx##m(Vd, Vd, 2); \
-    vsha_w_vx##m(Vd, Vd, _shift); \
-    vadd_w_vx##m(Vd, Vd, offset); \
+  do {                                                   \
+    int32_t _shift = RIGHT_SHIFT(shift);                 \
+    vdmulh_w_r_vx##m(Vd, Vs, mult);                      \
+    vsha_w_r_vx##m(Vd, Vd, _shift);                      \
+    vadd_w_vx##m(Vd, Vd, offset);                        \
   } while (0);
 
 #define rescale(Vd, Vs, mult, shift, offset) rescale_internal(Vd, Vs, mult, shift, offset, );