[sw/crt] Fix CRT code clobbering the first word of .bss

When this assembly was written, an incorrect assumption was made that
the _*_end pointers pointed to the final word in a section, when in
reality, they point one past the end of the section. Due to how the
linker laid out .data and .bss, .data initialization would clobber the
first word of .bss.

Signed-off-by: Miguel Young de la Sota <mcyoung@google.com>
diff --git a/sw/device/boot_rom/rom_crt.S b/sw/device/boot_rom/rom_crt.S
index 41ffaea..33c16e1 100644
--- a/sw/device/boot_rom/rom_crt.S
+++ b/sw/device/boot_rom/rom_crt.S
@@ -82,21 +82,21 @@
 bss_zero_loop:
   sw    zero, 0(t0)
   addi  t0, t0, 0x4
-  ble   t0, t1, bss_zero_loop
+  blt   t0, t1, bss_zero_loop
 bss_zero_loop_end:
 
   // Zero out the stack
   //
   // We use `t0` and `t1` to represent the start and end pointers of the stack.
   // As the stack grows downwards and we zero going forwards the start pointer
-  // starts as _stack_end and the end pointer at _stack_start - 4
+  // starts as _stack_end and the end pointer at _stack_start.
   la  t0, _stack_end
-  la  t1, (_stack_start - 4)
+  la  t1, _stack_start
   bge t0, t1, stack_zero_loop_end
 stack_zero_loop:
   sw    zero, 0(t0)
   addi  t0, t0, 0x4
-  ble   t0, t1, stack_zero_loop
+  blt   t0, t1, stack_zero_loop
 stack_zero_loop_end:
 
   // Initialize the `.data` segment from the `.idata` segment.
@@ -114,7 +114,7 @@
   sw   t3, 0(t0)
   addi t0, t0, 0x4
   addi t2, t2, 0x4
-  ble  t0, t1, data_copy_loop
+  blt  t0, t1, data_copy_loop
 data_copy_loop_end:
 
   // Re-clobber all of the registers from above.
diff --git a/sw/device/boot_rom/rom_link.ld b/sw/device/boot_rom/rom_link.ld
index cf6b334..c7b6127 100644
--- a/sw/device/boot_rom/rom_link.ld
+++ b/sw/device/boot_rom/rom_link.ld
@@ -116,6 +116,7 @@
      */
     *(.data)
     *(.data.*)
+    . = ALIGN(4);
     _data_end = .;
   } > ram_main
 
@@ -138,6 +139,7 @@
     *(.bss)
     *(.bss.*)
     *(COMMON)
+    . = ALIGN(4);
     _bss_end = .;
   } > ram_main
 
diff --git a/sw/device/exts/common/flash_crt.S b/sw/device/exts/common/flash_crt.S
index 84124f3..11faeea 100644
--- a/sw/device/exts/common/flash_crt.S
+++ b/sw/device/exts/common/flash_crt.S
@@ -48,21 +48,21 @@
 bss_zero_loop:
   sw    zero, 0(t0)
   addi  t0, t0, 0x4
-  ble   t0, t1, bss_zero_loop
+  blt   t0, t1, bss_zero_loop
 bss_zero_loop_end:
 
   // Zero out the stack
   //
   // We use `t0` and `t1` to represent the start and end pointers of the stack.
   // As the stack grows downwards and we zero going forwards the start pointer
-  // starts as _stack_end and the end pointer at _stack_start - 4
+  // starts as _stack_end and the end pointer at _stack_start.
   la  t0, _stack_end
-  la  t1, (_stack_start - 4)
+  la  t1, _stack_start
   bge t0, t1, stack_zero_loop_end
 stack_zero_loop:
   sw    zero, 0(t0)
   addi  t0, t0, 0x4
-  ble   t0, t1, stack_zero_loop
+  blt   t0, t1, stack_zero_loop
 stack_zero_loop_end:
 
   // Initialize the `.data` segment from the `.idata` segment.
@@ -80,7 +80,7 @@
   sw   t3, 0(t0)
   addi t0, t0, 0x4
   addi t2, t2, 0x4
-  ble  t0, t1, data_copy_loop
+  blt  t0, t1, data_copy_loop
 data_copy_loop_end:
 
   // Jump into the C program entry point. This is your standard
diff --git a/sw/device/exts/common/flash_link.ld b/sw/device/exts/common/flash_link.ld
index 59e8c2d..dfe1fde 100644
--- a/sw/device/exts/common/flash_link.ld
+++ b/sw/device/exts/common/flash_link.ld
@@ -112,6 +112,7 @@
      */
     *(.data)
     *(.data.*)
+    . = ALIGN(4);
     _data_end = .;
   } > ram_main
 
@@ -127,6 +128,7 @@
     *(.bss)
     *(.bss.*)
     *(COMMON)
+    . = ALIGN(4);
     _bss_end = .;
   } > ram_main