[sw/crt] Initialise Global Pointer Correctly

`__global_pointer$` is used by GNU ld during linker relaxation to
represent the value in `gp`, so that some accesses can have their
addresses materialised directly in the load/store instruction, rather
than having to do `lui; addi` or similar.

The real payoff of this relaxation is for loads and stores of small
globals. For this reason, we want to ensure that the small data sections
come before their respective regular sections, so they have a higher
probability of landing within a 12-bit signed immediate of
`__global_pointer$`. This commit reorders these sections and describes
why.

Signed-off-by: Sam Elliott <selliott@lowrisc.org>
diff --git a/sw/device/boot_rom/rom_crt.S b/sw/device/boot_rom/rom_crt.S
index 2009f91..6713de5 100644
--- a/sw/device/boot_rom/rom_crt.S
+++ b/sw/device/boot_rom/rom_crt.S
@@ -54,6 +54,13 @@
   // Set up the stack.
   la  sp, _stack_start
 
+  // Set up the global pointer. This requires that we disable linker relaxations
+  // (or it will be relaxed to `mv gp, gp`).
+  .option push
+  .option norelax
+  la  gp, __global_pointer$
+  .option pop
+
   // Explicit fall-through to |_start|.
 
 /**
diff --git a/sw/device/boot_rom/rom_link.ld b/sw/device/boot_rom/rom_link.ld
index 59f93a3..dfb328c 100644
--- a/sw/device/boot_rom/rom_link.ld
+++ b/sw/device/boot_rom/rom_link.ld
@@ -100,9 +100,19 @@
    */
   .data ORIGIN(ram): AT(_data_init_start) ALIGN(4) {
     _data_start = .;
+    __global_pointer$ = . + 2048;
+
+    /* Small data should come before larger data. This helps to ensure small
+     * globals are within the signed 12-bit offset range of `gp`, so that each
+     * access hopefully only takes one instruction. */
+    *(.sdata)
+    *(.sdata.*)
+
+    /* Other data will likely need multiple instructions to load, so we're less
+     * concerned about address materialisation taking more than one instruction.
+     */
     *(.data)
     *(.data.*)
-    *(.sdata)
     _data_end = .;
   } > ram
 
@@ -118,10 +128,12 @@
    */
   .bss : ALIGN(4) {
     _bss_start = .;
-    *(.bss)
-    *(.bss.*)
+    /* Small BSS comes before regular BSS for the same reason small data comes
+     * first in `.data`: to keep small globals closer to `gp`. */
     *(.sbss)
     *(.sbss.*)
+    *(.bss)
+    *(.bss.*)
     *(COMMON)
     _bss_end = .;
   } > ram
diff --git a/sw/device/exts/common/flash_crt.S b/sw/device/exts/common/flash_crt.S
index c082356..0ffac3b 100644
--- a/sw/device/exts/common/flash_crt.S
+++ b/sw/device/exts/common/flash_crt.S
@@ -25,6 +25,13 @@
   // jumps here will have the correct stack start linked in.
   la sp, _stack_start
 
+  // Set up the global pointer. This requires that we disable linker relaxations
+  // (or it will be relaxed to `mv gp, gp`).
+  .option push
+  .option norelax
+  la gp, __global_pointer$
+  .option pop
+
   // Set up the new interrupt vector.
   la   t0, _vectors_start
   csrw mtvec, t0
diff --git a/sw/device/exts/common/flash_link.ld b/sw/device/exts/common/flash_link.ld
index 3e41e5c..49c09f3 100644
--- a/sw/device/exts/common/flash_link.ld
+++ b/sw/device/exts/common/flash_link.ld
@@ -92,9 +92,19 @@
    */ 
   .data ORIGIN(ram): AT(_data_init_start) ALIGN(4) {
     _data_start = .;
+    __global_pointer$ = . + 2048;
+
+    /* Small data should come before larger data. This helps to ensure small
+     * globals are within the signed 12-bit offset range of `gp`, so that each
+     * access hopefully only takes one instruction. */
+    *(.sdata)
+    *(.sdata.*)
+
+    /* Other data will likely need multiple instructions to load, so we're less
+     * concerned about address materialisation taking more than one instruction.
+     */
     *(.data)
     *(.data.*)
-    *(.sdata)
     _data_end = .;
   } > ram
 
@@ -103,10 +113,12 @@
    */
   .bss : ALIGN(4) {
     _bss_start = .;
-    *(.bss)
-    *(.bss.*)
+    /* Small BSS comes before regular BSS for the same reason small data comes
+     * first in `.data`: to keep small globals closer to `gp`. */
     *(.sbss)
     *(.sbss.*)
+    *(.bss)
+    *(.bss.*)
     *(COMMON)
     _bss_end = .;
   } > ram