Merge remote-tracking branch 'spacebeaker/upstream' into update

Bypass-Presubmit-Reason: no CI sencha tests

Change-Id: I759201bf678fc8dd5f6b1f7e06f8a89078a7d4ae
diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
index 39fcd51..5a38f98 100644
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@@ -2,7 +2,7 @@
   "image": "ghcr.io/cheriot-platform/devcontainer:latest",
   "remoteUser": "cheriot",
   "containerUser": "cheriot",
-  "onCreateCommand": "git submodule init && git submodule update && cd tests && xmake f --sdk=/cheriot-tools/ && xmake project -k compile_commands .. && cd .. && for I in examples/[[:digit:]]* ; do echo $I ; cd $I ; xmake f --sdk=/cheriot-tools/ && xmake project -k compile_commands . && cd ../.. ; done",
+  "onCreateCommand": "git config --global --add safe.directory /workspaces/cheriot-rtos && git submodule init && git submodule update && cd tests && xmake f --sdk=/cheriot-tools/ && xmake project -k compile_commands .. && cd .. && for I in ex*/[[:digit:]]* ; do echo $I ; cd $I ; xmake f --sdk=/cheriot-tools/ && xmake project -k compile_commands . && cd ../.. ; done",
   "customizations": {
     "vscode": {
       "extensions": [
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 660447c..25cddb5 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -5,6 +5,7 @@
     branches: [ main ]
   pull_request:
     branches: [ main ]
+  merge_group:
   workflow_dispatch:
 
 jobs:
@@ -17,7 +18,7 @@
           - build-type: debug
             build-flags: --debug-loader=y --debug-scheduler=y --debug-allocator=y -m debug
           - build-type: release
-            build-flags: --debug-loader=n --debug-scheduler=n --debug-allocator=n -m release
+            build-flags: --debug-loader=n --debug-scheduler=n --debug-allocator=n -m release --stack-usage-check-allocator=y --stack-usage-check-scheduler=y
       fail-fast: false
     runs-on: ubuntu-latest
     container:
diff --git a/.gitmodules b/.gitmodules
index 88cbeef..2bc1dc4 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -4,6 +4,3 @@
 [submodule "sdk/third_party/microvium"]
 	path = sdk/third_party/microvium
 	url = https://github.com/coder-mike/microvium
-[submodule "sdk/third_party/freertos-plus-tcp"]
-	path = sdk/third_party/freertos-plus-tcp
-	url = https://github.com/FreeRTOS/FreeRTOS-Plus-TCP.git
diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md
index f9ba8cf..686e5e7 100644
--- a/CODE_OF_CONDUCT.md
+++ b/CODE_OF_CONDUCT.md
@@ -7,3 +7,4 @@
 - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
 - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
 - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
+- Employees can reach out at [aka.ms/opensource/moderation-support](https://aka.ms/opensource/moderation-support)
diff --git a/benchmarks/allocation/alloc.cc b/benchmarks/allocation/alloc.cc
index 7185f86..4f41652 100644
--- a/benchmarks/allocation/alloc.cc
+++ b/benchmarks/allocation/alloc.cc
@@ -31,7 +31,7 @@
 		}
 		auto end = rdcycle();
 		printf(__XSTRING(BOARD) "\t%ld\t%ld\n", static_cast<int>(size), end - start);
-		size_t quota = heap_quota_remaining(MALLOC_CAPABILITY);
+		auto quota = heap_quota_remaining(MALLOC_CAPABILITY);
 		Debug::Invariant(quota == MALLOC_QUOTA, "Quota remaining {}, should be {}", quota, MALLOC_QUOTA);
 		Debug::log("Flushing quarantine");
 		heap_quarantine_empty();
diff --git a/docs/Allocator.md b/docs/Allocator.md
index 0006c32..96f3c5f 100644
--- a/docs/Allocator.md
+++ b/docs/Allocator.md
@@ -52,7 +52,7 @@
 These functions will fail if the allocator capability does not have sufficient remaining quota to handle the allocation (or if the allocator itself is out of memory).
 All allocations have an eight-byte header and this counts towards the quota, so the total quota required is the sum of the size of all objects plus eight times the number of live objects.
 
-The amount of quota remaining in a allocator capability can be queried with `heap_quota_remaining`.
+The amount of quota remaining in an allocator capability can be queried with `heap_quota_remaining`.
 
 The `heap_free` function deallocates memory.
 This must be called with the same allocator capability that allocated the memory (you may not free memory unless authorised to do so).
@@ -96,3 +96,52 @@
 The C++ `new` / `delete` functions wrap `malloc` and friends.
 These can be hidden by defining the `CHERIOT_NO_NEW_DELETE` macro.
 
+Handling of failure
+-------------------
+
+All `heap_` allocator APIs can fail with `-ENOTENOUGHSTACK`, which indicates that the stack space available was insufficient for the allocator to safely execute.
+This is because executing the allocator on insufficient stack space may allow attackers to trigger arbitrary failures in the allocator through carefully setting the size of the stack.
+
+This implies an important difference of semantics between `heap_allocate` and the C stdlib `malloc`: upon failure `heap_allocate` may return a `(void*) -ENOTENOUGHSTACK` pointer as well as a `NULL` pointer.
+Success of `heap_allocate` must thus be checked by querying the tag bit of the returned capability, instead of comparing with a `NULL` pointer.
+
+A correct usage of `heap_allocate` with the C++ API looks like the following:
+```C++
+Timeout t{1};
+Capability ptr{heap_allocate(&t, MALLOC_CAPABILITY, sizeof(int))};
+// Correct check
+if (!ptr.is_valid())
+{
+    // ...
+}
+```
+
+Using the C-level API, a correct check is:
+```C
+Timeout t   = {0, UnlimitedTimeout};
+void *ptr = heap_allocate(&t, MALLOC_CAPABILITY, sizeof(int));
+// Correct check
+if (!__builtin_cheri_tag_get(ptr))
+{
+    // ...
+}
+```
+
+As opposed to an incorrect `NULL` check, which would fail if the return value is `-ENOTENOUGHSTACK`:
+```C++
+Timeout t{1};
+void *ptr = heap_allocate(&t, MALLOC_CAPABILITY, sizeof(int));
+// Incorrect, this will go wrong if we get (void*) ENOTENOUGHSTACK
+if (ptr == nullptr)
+{
+    // ...
+}
+```
+
+Similarly, the semantics of `heap_free` differ from those of the C stdlib `free`.
+Whereas the stdlib `free` does not report failure (e.g., if passed pointer is not a heap-allocated pointer, or if the function runs out of stack), `heap_free` returns an errno value.
+
+`heap_free` may fail if the stack is insufficient, if the heap capability cannot be used to free the capability, or if the pointer has already been freed, among others.
+Still - provided a valid heap-allocated pointer, `heap_free` should only fail if the stack space is insufficient.
+`heap_can_free` can be used to check if a call to `heap_free` would succeed without actually performing the free.
+This is useful to ensure that a failure to free will not happen at a point of no return (e.g., when resources have already been partially freed).
diff --git a/docs/GettingStarted.md b/docs/GettingStarted.md
index 1aba7fc..d115e68 100644
--- a/docs/GettingStarted.md
+++ b/docs/GettingStarted.md
@@ -237,6 +237,10 @@
 We will add documentation for this part later.
 In the meantime, our [blog post](https://cheriot.org/fpga/try/2023/11/16/cheriot-on-the-arty-a7.html) provides pointers on how to do this.
 
+There are two default clock speeds that SAFE configuration can be synthesised at following the build instructions, 20MHz and 33MHz.
+The expected clock speed for the Arty A7 board in this project is 33MHz, but can be changed if the FPGA design is running at a different clock speed.
+If you want to use the 20MHz, or any other clock speed, implementation, please update the *timer_hz* field in the board file `sdk/boards/ibex-arty-a7-100.json`
+
 ### Building, Copying, and Running the Firmware
 
 We have now configured the FPGA.
diff --git a/examples/04.temporal_safety/README.md b/examples/04.temporal_safety/README.md
index e326c7e..1079d64 100644
--- a/examples/04.temporal_safety/README.md
+++ b/examples/04.temporal_safety/README.md
@@ -15,7 +15,7 @@
 The first value is the address, which is the same in both.
 Everything else, shown in brackets, is the *capability metadata*.
 
-The only field that changes after three call to `free` is the *tag* or *valid* bit.
+The only field that changes after the call to `free` is the *tag* or *valid* bit.
 This is the bit that indicates whether this is a valid capability.
 When this bit is cleared, the CPU will not permit this pointer to be used to authorise any operations.
 Its fields can still be read, which is why the other lines are the same.
diff --git a/examples/08.memory_safety/README.md b/examples/08.memory_safety/README.md
index 4eb367e..9c9151f 100644
--- a/examples/08.memory_safety/README.md
+++ b/examples/08.memory_safety/README.md
@@ -9,8 +9,8 @@
 ## Spatial safety
 
 By "spatial safety" we mean that every memory allocation has a well-defined size/bounds, and it's incorrect to access memory outside of these bounds. 
-Every C/C++ pointer in our system is a bounded, unforgeable CHERI capability.Every C/C++ pointer in our system is a bounded unforgeable CHERI capability.
-Any dereference Any dereference outside of an allocation's bounds, whether adjacent to the allocation or far outside, will trap. of an allocation's bounds, whether adjacent to the allocation or far outside, will trap.
+Every C/C++ pointer in our system is a bounded, unforgeable CHERI capability.
+Any dereference outside of an allocation's bounds, whether adjacent to the allocation or far outside, will trap. of an allocation's bounds, whether adjacent to the allocation or far outside, will trap.
 This is enforced by the hardware and so the same guarantees apply to accesses from assembly code.
 This property is enforced at the architectural level.
 You can see examples for that in the cases for `StackLinearOverflow`, `HeapLinearOverflow`, and `HeapNonlinearOverflow`, [implemented](memory_safety_inner.cc:49) in the inner compartment.
diff --git a/exercises/01.compartmentalisation/microvium-ffi.hh b/exercises/01.compartmentalisation/microvium-ffi.hh
index 02818ee..a919a17 100644
--- a/exercises/01.compartmentalisation/microvium-ffi.hh
+++ b/exercises/01.compartmentalisation/microvium-ffi.hh
@@ -376,6 +376,7 @@
 	                                     mvm_Value         *args,
 	                                     uint8_t            argCount)
 	{
+		auto *uart = MMIO_CAPABILITY(Uart, uart);
 		// Iterate over the arguments.
 		for (unsigned i = 0; i < argCount; i++)
 		{
@@ -384,11 +385,11 @@
 			// Write each character to the UART
 			for (; *str != '\0'; str++)
 			{
-				ImplicitUARTOutput::write(*str);
+				uart->blocking_write(*str);
 			}
 		}
 		// Write a trailing newline
-		ImplicitUARTOutput::write('\n');
+		uart->blocking_write('\n');
 		// Unconditionally return success
 		return MVM_E_SUCCESS;
 	}
diff --git a/exercises/01.compartmentalisation/xmake.lua b/exercises/01.compartmentalisation/xmake.lua
index b879cc5..0534b24 100644
--- a/exercises/01.compartmentalisation/xmake.lua
+++ b/exercises/01.compartmentalisation/xmake.lua
@@ -24,6 +24,7 @@
 firmware("javascript")
     add_deps("crt", "freestanding", "string", "microvium", "atomic_fixed")
     add_deps("js")
+    add_deps("debug")
     on_load(function(target)
         target:values_set("board", "$(board)")
         target:values_set("threads", {
diff --git a/scripts/run-sonata.sh b/scripts/run-sonata.sh
new file mode 100755
index 0000000..fc4238f
--- /dev/null
+++ b/scripts/run-sonata.sh
@@ -0,0 +1,41 @@
+#!/bin/sh
+
+set -e
+
+FIRMWARE_ELF=$1
+
+SCRIPT_DIRECTORY="$(dirname "$(realpath "$0")")"
+. ${SCRIPT_DIRECTORY}/includes/helper_find_llvm_install.sh
+
+OBJCOPY=$(find_llvm_tool_required llvm-objcopy)
+
+command -v uf2conv > /dev/null
+if [ ! $? ] ; then
+	echo "uf2conv not found.  On macOS / Linux systems with Python3 installed, you can install it with:"
+	echo "python3 -m pip install --pre -U git+https://github.com/makerdiary/uf2utils.git@main"
+fi
+
+# Convert the ELF file to a binary file
+${OBJCOPY} -O binary ${FIRMWARE_ELF} ${FIRMWARE_ELF}.bin
+# Convert the binary to a UF2 (Microsoft USB Flashing Format) file
+uf2conv ${FIRMWARE_ELF}.bin -b0x00101000 -co ${FIRMWARE_ELF}.uf2
+
+# Try to copy the firmware to the SONATA drive, if we can find one.
+try_copy()
+{
+	if [ -f $1/SONATA/OPTIONS.TXT ] ; then
+		cp ${FIRMWARE_ELF}.uf2 $1/SONATA/firmware.uf2
+		echo "Firmware copied to $1/SONATA/"
+		exit
+	fi
+}
+
+# Try some common mount points
+try_copy /Volumes/
+try_copy /run/media/$USER/
+try_copy /run/media/
+try_copy /mnt/
+
+cp ${FIRMWARE_ELF}.uf2 firmware.uf2
+
+echo "Please copy $(pwd)/firmware.uf2 to the SONATA drive to load."
diff --git a/sdk/boards/sonata.json b/sdk/boards/sonata.json
new file mode 100644
index 0000000..7d8433a
--- /dev/null
+++ b/sdk/boards/sonata.json
@@ -0,0 +1,55 @@
+{
+    "devices": {
+        "shadow" : {
+            "start" : 0x30000000,
+            "end"   : 0x30004000
+        },
+        "clint": {
+            "start" : 0x80040000,
+            "end"   : 0x80050000
+        },
+        "uart": {
+            "start" : 0x80100000,
+            "end"   : 0x80100034
+        },
+        "gpio" : {
+            "start" : 0x80000000,
+            "end"   : 0x80000020
+        },
+        "plic": {
+            "start" : 0x88000000,
+            "end"   : 0x88400000
+        }
+    },
+    "instruction_memory": {
+        "start": 0x00101000,
+        "end":   0x00140000
+    },
+    "heap": {
+        "end": 0x00140000
+    },
+    "revokable_memory_start": 0x00100000,
+    "interrupts": [
+        {
+            "name": "UartRxWatermark",
+            "number": 2,
+            "priority": 3,
+            "edge_triggered": true
+        }
+    ],
+    "defines" : [
+        "IBEX",
+        "SUNBURST",
+        "SUNBURST_SHADOW_BASE=0x30000000",
+        "SUNBURST_SHADOW_SIZE=0x4000"
+    ],
+    "driver_includes" : [
+        "../include/platform/sunburst",
+        "../include/platform/generic-riscv"
+    ],
+    "timer_hz" : 30000000,
+    "tickrate_hz" : 100,
+    "revoker" : "software",
+    "simulator" : "${sdk}/../scripts/run-sonata.sh",
+    "simulation": false
+}
diff --git a/sdk/core/allocator/alloc_config.h b/sdk/core/allocator/alloc_config.h
index a032fd2..522ed8a 100644
--- a/sdk/core/allocator/alloc_config.h
+++ b/sdk/core/allocator/alloc_config.h
@@ -23,3 +23,16 @@
 
 constexpr size_t MallocAlignment = 1U << MallocAlignShift;
 constexpr size_t MallocAlignMask = MallocAlignment - 1;
+
+constexpr StackCheckMode StackMode =
+#if CHERIOT_STACK_CHECKS_ALLOCATOR
+  StackCheckMode::Asserting
+#else
+  StackCheckMode::Disabled
+// Uncomment if checks failed to find the correct values
+// StackCheckMode::Logging
+#endif
+  ;
+
+#define STACK_CHECK(expected)                                                  \
+	StackUsageCheck<StackMode, expected, __PRETTY_FUNCTION__> stackCheck
diff --git a/sdk/core/allocator/main.cc b/sdk/core/allocator/main.cc
index a7364b6..29a652b 100644
--- a/sdk/core/allocator/main.cc
+++ b/sdk/core/allocator/main.cc
@@ -803,8 +803,10 @@
 
 } // namespace
 
-size_t heap_quota_remaining(struct SObjStruct *heapCapability)
+__cheriot_minimum_stack(0x80) ssize_t
+  heap_quota_remaining(struct SObjStruct *heapCapability)
 {
+	STACK_CHECK(0x80);
 	LockGuard g{lock};
 	auto     *cap = malloc_capability_unseal(heapCapability);
 	if (cap == nullptr)
@@ -814,8 +816,9 @@
 	return cap->quota;
 }
 
-void heap_quarantine_empty()
+__cheriot_minimum_stack(0xb0) void heap_quarantine_empty()
 {
+	STACK_CHECK(0xb0);
 	LockGuard g{lock};
 	while (gm->heapQuarantineSize > 0)
 	{
@@ -829,8 +832,11 @@
 	}
 }
 
-void *heap_allocate(Timeout *timeout, SObj heapCapability, size_t bytes)
+__cheriot_minimum_stack(0x1f0) void *heap_allocate(Timeout *timeout,
+                                                   SObj     heapCapability,
+                                                   size_t   bytes)
 {
+	STACK_CHECK(0x1f0);
 	if (!check_timeout_pointer(timeout))
 	{
 		return nullptr;
@@ -850,8 +856,10 @@
 	return malloc_internal(bytes, std::move(g), cap, timeout);
 }
 
-size_t heap_claim(SObj heapCapability, void *pointer)
+__cheriot_minimum_stack(0x1b0) ssize_t
+  heap_claim(SObj heapCapability, void *pointer)
 {
+	STACK_CHECK(0x1b0);
 	LockGuard g{lock};
 	auto     *cap = malloc_capability_unseal(heapCapability);
 	if (cap == nullptr)
@@ -878,14 +886,18 @@
 	return 0;
 }
 
-int heap_can_free(SObj heapCapability, void *rawPointer)
+__cheriot_minimum_stack(0xe0) int heap_can_free(SObj  heapCapability,
+                                                void *rawPointer)
 {
+	STACK_CHECK(0xe0);
 	LockGuard g{lock};
 	return heap_free_internal(heapCapability, rawPointer, false);
 }
 
-int heap_free(SObj heapCapability, void *rawPointer)
+__cheriot_minimum_stack(0x250) int heap_free(SObj  heapCapability,
+                                             void *rawPointer)
 {
+	STACK_CHECK(0x250);
 	LockGuard g{lock};
 	int       ret = heap_free_internal(heapCapability, rawPointer, true);
 	if (ret != 0)
@@ -904,8 +916,9 @@
 	return 0;
 }
 
-ssize_t heap_free_all(SObj heapCapability)
+__cheriot_minimum_stack(0x180) ssize_t heap_free_all(SObj heapCapability)
 {
+	STACK_CHECK(0x180);
 	LockGuard g{lock};
 	auto     *capability = malloc_capability_unseal(heapCapability);
 	if (capability == nullptr)
@@ -942,11 +955,12 @@
 	return freed;
 }
 
-void *heap_allocate_array(Timeout *timeout,
-                          SObj     heapCapability,
-                          size_t   nElements,
-                          size_t   elemSize)
+__cheriot_minimum_stack(0x1f0) void *heap_allocate_array(Timeout *timeout,
+                                                         SObj   heapCapability,
+                                                         size_t nElements,
+                                                         size_t elemSize)
 {
+	STACK_CHECK(0x1f0);
 	if (!check_timeout_pointer(timeout))
 	{
 		return nullptr;
@@ -1087,12 +1101,14 @@
 	return nullptr;
 }
 
-SObj token_sealed_unsealed_alloc(Timeout *timeout,
-                                 SObj     heapCapability,
-                                 SKey     key,
-                                 size_t   sz,
-                                 void   **unsealed)
+__cheriot_minimum_stack(0x250) SObj
+  token_sealed_unsealed_alloc(Timeout *timeout,
+                              SObj     heapCapability,
+                              SKey     key,
+                              size_t   sz,
+                              void   **unsealed)
 {
+	STACK_CHECK(0x250);
 	if (!check_timeout_pointer(timeout))
 	{
 		return INVALID_SOBJ;
@@ -1112,11 +1128,12 @@
 	return INVALID_SOBJ;
 }
 
-SObj token_sealed_alloc(Timeout *timeout,
-                        SObj     heapCapability,
-                        SKey     rawKey,
-                        size_t   sz)
+__cheriot_minimum_stack(0x250) SObj token_sealed_alloc(Timeout *timeout,
+                                                       SObj     heapCapability,
+                                                       SKey     rawKey,
+                                                       size_t   sz)
 {
+	STACK_CHECK(0x250);
 	return allocate_sealed_unsealed(
 	         timeout, heapCapability, rawKey, sz, {Permission::Seal})
 	  .first;
@@ -1149,8 +1166,11 @@
 	return unsealed;
 }
 
-int token_obj_destroy(SObj heapCapability, SKey key, SObj object)
+__cheriot_minimum_stack(0x250) int token_obj_destroy(SObj heapCapability,
+                                                     SKey key,
+                                                     SObj object)
 {
+	STACK_CHECK(0x250);
 	void *unsealed;
 	{
 		LockGuard g{lock};
@@ -1168,8 +1188,11 @@
 	return heap_free(heapCapability, unsealed);
 }
 
-int token_obj_can_destroy(SObj heapCapability, SKey key, SObj object)
+__cheriot_minimum_stack(0xe0) int token_obj_can_destroy(SObj heapCapability,
+                                                        SKey key,
+                                                        SObj object)
 {
+	STACK_CHECK(0xe0);
 	void *unsealed;
 	{
 		LockGuard g{lock};
diff --git a/sdk/core/allocator/revoker.h b/sdk/core/allocator/revoker.h
index 5d25b97..e484b4f 100644
--- a/sdk/core/allocator/revoker.h
+++ b/sdk/core/allocator/revoker.h
@@ -384,31 +384,32 @@
 		template<bool AllowPartial = false>
 		uint32_t has_revocation_finished_for_epoch(uint32_t previousEpoch)
 		{
+			auto current = *epoch;
 			// If the revoker is running, prod it to do a bit more work every
 			// time that it's queried.
-			if ((*epoch & 1) == 1)
+			if ((current & 1) == 1)
 			{
 				revoker_tick();
+				current = *epoch;
 			}
+			// We want to know if current is greater than epoch, but current
+			// may have wrapped.  Perform unsigned subtraction (guaranteed to
+			// wrap) and then coerce the result to a signed value.  This will
+			// be correct unless we have more than 2^31 revocations in between
+			// checks
+			std::make_signed_t<decltype(current)> distance =
+			  current - previousEpoch;
 			if (AllowPartial)
 			{
-				return *epoch > previousEpoch;
+				return distance >= 0;
 			}
-			// We want to check if a complete revocation pass has happened
-			// since the last query of the epoch.  If the last query happened
-			// in the middle of revocation then the low bit will be 1.  In this
-			// case, we need to ensure that the revocation epoch has
-			// incremented by 3 (finished the in-progress sweep, started and
-			// finished the next one).  In the other case, where revocation was
-			// not in progress, we need to ensure only that one complete
-			// revocation pass has happened and so the epoch counter was
-			// incremented twice.
-			//
-			// We perform a subtraction first because unsigned overflow is
-			// well-defined in C to wrap and so, as long as we don't manage to
-			// do 2^31 revocation sweeps without the consumer noticing then
-			// this will give the correct result even in overflow cases.
-			return *epoch - previousEpoch >= (2 + (previousEpoch & 1));
+			// The allocator stores the epoch when things can be popped, which
+			// is always a complete (even) epoch.
+			Debug::Assert((previousEpoch & 1) == 0, "Epoch must be even");
+			// If the current epoch is odd then the epoch needs to be at least
+			// two more, to capture the fact that this is a complete epoch.
+			decltype(distance) minimumRequired = 1 + (previousEpoch & 1);
+			return distance > minimumRequired;
 		}
 
 		/// Start revocation running.
diff --git a/sdk/core/allocator/token.h b/sdk/core/allocator/token.h
index 55f0385..1c75670 100644
--- a/sdk/core/allocator/token.h
+++ b/sdk/core/allocator/token.h
@@ -59,7 +59,7 @@
 	{
 		return {reinterpret_cast<uintptr_t>(
 		          static_cast<const volatile void *>(value)),
-		        DebugFormatArgument::Pointer};
+		        DebugFormatArgumentKind::DebugFormatArgumentPointer};
 	}
 };
 
diff --git a/sdk/core/loader/boot.S b/sdk/core/loader/boot.S
index 60da6f0..ae915cb 100644
--- a/sdk/core/loader/boot.S
+++ b/sdk/core/loader/boot.S
@@ -13,10 +13,10 @@
     .p2align 2
     .type start,@function
 start:
-	cjal			.Lregs_clear
-	// The common register clearing function will not zero these registers.
-	zeroRegisters	ra, sp, gp, a0
-	// At this point all registers are cleared.
+	// Most hardware will have zeroed all general-purposes registers at this
+	// point, but any code between here and the call into the scheduler has
+	// access to the root capabilities and so we are not concerned about
+	// information / capability leakage.
 #if __has_include(<platform-early_boot.inc>)
 #	include <platform-early_boot.inc>
 #endif
@@ -142,7 +142,6 @@
 	addi			s1, s1, -16
 	csetbounds		ca0, csp, s1
 
-.Lregs_clear:
 	// a0 is used to pass arguments to the scheduler entry.
 	zeroAllRegistersExcept	ra, sp, gp, a0
 	cjalr			cra
diff --git a/sdk/core/loader/boot.cc b/sdk/core/loader/boot.cc
index 26bb5a8..16a5ee6 100644
--- a/sdk/core/loader/boot.cc
+++ b/sdk/core/loader/boot.cc
@@ -464,13 +464,29 @@
 					}
 				}
 			}
-			Debug::Invariant(entry.address >= LA_ABS(__mmio_region_start),
-			                 "{} is not in the MMIO range",
-			                 entry.address);
-			Debug::Invariant(entry.address + entry.size() <=
-			                   LA_ABS(__mmio_region_end),
-			                 "{} is not in the MMIO range",
-			                 entry.address + entry.size());
+			// MMIO regions should be within the range of the MMIO space.  As a
+			// special case (to be removed at some point when we have a generic
+			// way of setting up shared objects), allow the hazard-pointer
+			// region and hazard epoch to be excluded from this test.
+			Debug::Invariant(
+			  ((entry.address >= LA_ABS(__mmio_region_start)) &&
+			   (entry.address + entry.size() <= LA_ABS(__mmio_region_end))) ||
+			    ((entry.address == LA_ABS(__export_mem_allocator_epoch)) &&
+			     (entry.address + entry.size() ==
+			      LA_ABS(__export_mem_allocator_epoch_end))) ||
+			    ((entry.address == LA_ABS(__export_mem_hazard_pointers)) &&
+			     (entry.address + entry.size() ==
+			      LA_ABS(__export_mem_hazard_pointers_end))),
+			  "{}--{} is not in the MMIO range ({}--{}) or the hazard pointer "
+			  "range ({}--{}) or the allocator epoch range ({}--{})",
+			  entry.address,
+			  entry.address + entry.size(),
+			  LA_ABS(__mmio_region_start),
+			  LA_ABS(__mmio_region_end),
+			  LA_ABS(__export_mem_hazard_pointers),
+			  LA_ABS(__export_mem_hazard_pointers_end),
+			  LA_ABS(__export_mem_allocator_epoch),
+			  LA_ABS(__export_mem_allocator_epoch_end));
 			auto ret = build(entry.address, entry.size());
 			// Remove any permissions that shouldn't be held here.
 			ret.permissions() &= entry.permissions();
@@ -1205,7 +1221,7 @@
 	for (auto &compartment : imgHdr.libraries_and_compartments())
 	{
 		auto expTablePtr = getExportTableHeader(compartment.exportTable);
-		Debug::log("Error handler for compartment is {})",
+		Debug::log("Error handler for compartment is {}",
 		           expTablePtr->errorHandler);
 		expTablePtr->pcc = build_pcc(compartment);
 		expTablePtr->cgp = build_cgp(compartment);
diff --git a/sdk/core/scheduler/common.h b/sdk/core/scheduler/common.h
index 503809f..565152d 100644
--- a/sdk/core/scheduler/common.h
+++ b/sdk/core/scheduler/common.h
@@ -30,6 +30,20 @@
 	  ;
 
 	using Debug = ConditionalDebug<DebugScheduler, "Scheduler">;
+
+	constexpr StackCheckMode StackMode =
+#if CHERIOT_STACK_CHECKS_SCHEDULER
+	  StackCheckMode::Asserting
+#else
+	  StackCheckMode::Disabled
+	// Uncomment if checks failed to find the correct values
+	// StackCheckMode::Logging
+#endif
+	  ;
+
+#define STACK_CHECK(expected)                                                  \
+	StackUsageCheck<StackMode, expected, __PRETTY_FUNCTION__> stackCheck
+
 	/**
 	 * Base class for types that are exported from the scheduler with a common
 	 * sealing type.  Includes an inline type marker.
@@ -233,6 +247,10 @@
 		HeapObject(struct SObjStruct *heapCapability, T *allocatedObject)
 		  : pointer(allocatedObject, heapCapability)
 		{
+			if (!__builtin_cheri_tag_get(allocatedObject))
+			{
+				pointer = nullptr;
+			}
 		}
 
 		/**
@@ -250,10 +268,14 @@
 		              heap_allocate(timeout, heapCapability, sizeof(T))),
 		            {heapCapability})
 		{
-			if (pointer)
+			if (__builtin_cheri_tag_get(pointer.raw()))
 			{
 				new (pointer.get()) T(std::forward<Args>(args)...);
 			}
+			else
+			{
+				pointer = nullptr;
+			}
 		}
 
 		/**
diff --git a/sdk/core/scheduler/main.cc b/sdk/core/scheduler/main.cc
index c06f19a..88442be 100644
--- a/sdk/core/scheduler/main.cc
+++ b/sdk/core/scheduler/main.cc
@@ -418,8 +418,10 @@
 	return ret;
 }
 
-int __cheri_compartment("sched") thread_sleep(Timeout *timeout)
+__cheriot_minimum_stack(0x80) int __cheri_compartment("sched")
+  thread_sleep(Timeout *timeout)
 {
+	STACK_CHECK(0x80);
 	if (!check_timeout_pointer(timeout))
 	{
 		return -EINVAL;
@@ -428,11 +430,12 @@
 	return 0;
 }
 
-int futex_timed_wait(Timeout        *timeout,
-                     const uint32_t *address,
-                     uint32_t        expected,
-                     FutexWaitFlags  flags)
+__cheriot_minimum_stack(0xa0) int futex_timed_wait(Timeout        *timeout,
+                                                   const uint32_t *address,
+                                                   uint32_t        expected,
+                                                   FutexWaitFlags  flags)
 {
+	STACK_CHECK(0xa0);
 	if (!check_timeout_pointer(timeout) ||
 	    !check_pointer<PermissionSet{Permission::Load}>(address))
 	{
@@ -512,8 +515,9 @@
 	return 0;
 }
 
-int futex_wake(uint32_t *address, uint32_t count)
+__cheriot_minimum_stack(0x90) int futex_wake(uint32_t *address, uint32_t count)
 {
+	STACK_CHECK(0x90);
 	if (!check_pointer<PermissionSet{Permission::Store}>(address))
 	{
 		return -EINVAL;
@@ -552,11 +556,13 @@
 	return woke;
 }
 
-int multiwaiter_create(Timeout           *timeout,
-                       struct SObjStruct *heapCapability,
-                       MultiWaiter      **ret,
-                       size_t             maxItems)
+__cheriot_minimum_stack(0x50) int multiwaiter_create(
+  Timeout           *timeout,
+  struct SObjStruct *heapCapability,
+  MultiWaiter      **ret,
+  size_t             maxItems)
 {
+	STACK_CHECK(0x50);
 	int error;
 	// Don't bother checking if timeout is valid, the allocator will check for
 	// us.
@@ -570,16 +576,20 @@
 	return write_result(reinterpret_cast<void **>(ret), mw);
 }
 
-int multiwaiter_delete(struct SObjStruct *heapCapability, MultiWaiter *mw)
+__cheriot_minimum_stack(0x60) int multiwaiter_delete(
+  struct SObjStruct *heapCapability,
+  MultiWaiter       *mw)
 {
+	STACK_CHECK(0x60);
 	return deallocate<MultiWaiterInternal>(heapCapability, mw);
 }
 
-int multiwaiter_wait(Timeout           *timeout,
-                     MultiWaiter       *waiter,
-                     EventWaiterSource *events,
-                     size_t             newEventsCount)
+__cheriot_minimum_stack(0xb0) int multiwaiter_wait(Timeout           *timeout,
+                                                   MultiWaiter       *waiter,
+                                                   EventWaiterSource *events,
+                                                   size_t newEventsCount)
 {
+	STACK_CHECK(0xb0);
 	return typed_op<MultiWaiterInternal>(waiter, [&](MultiWaiterInternal &mw) {
 		if (newEventsCount > mw.capacity())
 		{
@@ -667,9 +677,10 @@
 	};
 } // namespace
 
-[[cheri::interrupt_state(disabled)]] const uint32_t *
-interrupt_futex_get(struct SObjStruct *sealed)
+[[cheri::interrupt_state(disabled)]] __cheriot_minimum_stack(
+  0x20) const uint32_t *interrupt_futex_get(struct SObjStruct *sealed)
 {
+	STACK_CHECK(0x20);
 	auto     *interruptCapability = Handle::unseal<InterruptCapability>(sealed);
 	uint32_t *result              = nullptr;
 	if (interruptCapability && interruptCapability->state.mayWait)
@@ -686,9 +697,10 @@
 	return result;
 }
 
-[[cheri::interrupt_state(disabled)]] int
-interrupt_complete(struct SObjStruct *sealed)
+[[cheri::interrupt_state(disabled)]] __cheriot_minimum_stack(
+  0x10) int interrupt_complete(struct SObjStruct *sealed)
 {
+	STACK_CHECK(0x10);
 	auto *interruptCapability = Handle::unseal<InterruptCapability>(sealed);
 	if (interruptCapability && interruptCapability->state.mayComplete)
 	{
diff --git a/sdk/core/scheduler/multiwait.h b/sdk/core/scheduler/multiwait.h
index d5630d0..23710dc 100644
--- a/sdk/core/scheduler/multiwait.h
+++ b/sdk/core/scheduler/multiwait.h
@@ -194,7 +194,7 @@
 			                        heapCapability,
 			                        sizeof(MultiWaiterInternal) +
 			                          (length * sizeof(EventWaiter)));
-			if (q == nullptr)
+			if (!__builtin_cheri_tag_get(q))
 			{
 				error = -ENOMEM;
 				return {};
diff --git a/sdk/core/switcher/entry.S b/sdk/core/switcher/entry.S
index 93baf4a..8836d08 100644
--- a/sdk/core/switcher/entry.S
+++ b/sdk/core/switcher/entry.S
@@ -3,6 +3,7 @@
 
 #include "export-table-assembly.h"
 #include "trusted-stack-assembly.h"
+#include <errno.h>
 
 .include "assembly-helpers.s"
 
@@ -246,6 +247,17 @@
 	// table pointer.  Nothing between this point and transition to the callee
 	// should fault.
 	csc                ct1, TrustedStackFrame_offset_calleeExportTable(ctp)
+
+	// Load the minimum stack size required by the callee.
+	clbu               tp, ExportEntry_offset_minimumStackSize(ct1)
+	// The stack size is in 8-byte units, so multiply by 8.
+	slli               tp, tp, 3
+	// Check that the stack is large enough for the callee.
+	// At this point, we have already truncated the stack and so the length of
+	// the stack is the length that the callee can use.
+	cgetlen            t2, csp
+	bgtu               tp, t2, .Lstack_too_small
+
 	// Get the flags field into tp
 	clbu               tp, ExportEntry_offset_flags(ct1)
 	cgetbase           s1, ct1
@@ -289,6 +301,7 @@
 	zeroRegisters      tp, t1, t2, s0, s1
 	cjalr              cra
 
+.Lskip_compartment_call:
 	// If we are doing a forced unwind of the trusted stack then we do almost
 	// exactly the same as a normal unwind.  We will jump here from the
 	// exception path.
@@ -302,6 +315,15 @@
 	// ca1, used for second return value
 	zeroAllRegistersExcept ra, sp, gp, s0, s1, a0, a1
 	cret
+
+	// If the stack is too small, we don't do the call, but to avoid leaking
+	// any other state we still go through the same return path as normal.  We
+	// set the return registers to -ENOTENOUGHSTACK and 0, so users can see
+	// that this is the failure reason.
+.Lstack_too_small:
+	li                 a0, -ENOTENOUGHSTACK
+	li                 a1, 0
+	j                  .Lskip_compartment_call
 .size compartment_switcher_entry, . - compartment_switcher_entry
 
 	// the entry point of all exceptions and interrupts
@@ -952,6 +974,13 @@
 	cret
 
 
+	.section .text, "ax", @progbits
+	.p2align 2
+	.type __Z25stack_lowest_used_addressv,@function
+__Z25stack_lowest_used_addressv:
+	// Read the stack high-water mark into the return register.
+	csrr               a0, CSR_MSHWM
+	cret
 
 // The linker expects export tables to start with space for cgp and pcc, then
 // the compartment error handler.  We should eventually remove that requirement
@@ -986,3 +1015,4 @@
 export __Z23switcher_current_threadv
 export __Z28switcher_thread_hazard_slotsv
 export __Z13thread_id_getv
+export __Z25stack_lowest_used_addressv
diff --git a/sdk/include/FreeRTOS-Compat/queue.h b/sdk/include/FreeRTOS-Compat/queue.h
index 25c9e25..088d588 100644
--- a/sdk/include/FreeRTOS-Compat/queue.h
+++ b/sdk/include/FreeRTOS-Compat/queue.h
@@ -88,7 +88,7 @@
 {
 	QueueHandle_t  ret;
 	struct Timeout timeout = {0, UnlimitedTimeout};
-	ret                    = (QueueHandle_t)malloc(sizeof(struct QueueHandle));
+	ret                    = (QueueHandle_t)malloc(sizeof(*ret));
 	if (!ret)
 	{
 		return NULL;
diff --git a/sdk/include/__debug.h b/sdk/include/__debug.h
new file mode 100644
index 0000000..637e166
--- /dev/null
+++ b/sdk/include/__debug.h
@@ -0,0 +1,74 @@
+#pragma once
+#include <compartment-macros.h>
+#include <stddef.h>
+#include <stdint.h>
+
+/**
+ * The kind of value, for values that have special-cased handling.
+ */
+enum DebugFormatArgumentKind : ptraddr_t
+{
+	/// Boolean, printed as "true" or "false".
+	DebugFormatArgumentBool,
+	/// Single character.
+	DebugFormatArgumentCharacter,
+	/// Signed 32-bit integer, printed as decimal.
+	DebugFormatArgumentSignedNumber32,
+	/// Unsigned 32-bit integer, printed as hexadecimal
+	DebugFormatArgumentUnsignedNumber32,
+	/// Signed 64-bit integer, printed as decimal.
+	DebugFormatArgumentSignedNumber64,
+	/// Unsigned 64-bit integer, printed as hexadecimal.
+	DebugFormatArgumentUnsignedNumber64,
+	/// Pointer, printed as a full capability.
+	DebugFormatArgumentPointer,
+	/// Special case for permission sets, printed as in the capability
+	/// format.
+	DebugFormatArgumentPermissionSet,
+	/// C string, printed as-is.
+	DebugFormatArgumentCString,
+	/// String view, printed as-is.
+	DebugFormatArgumentStringView,
+};
+
+struct DebugFormatArgument
+{
+	/**
+	 * The value that is being written.
+	 */
+	uintptr_t value;
+	/**
+	 * The kind of value that is being written.  This is either a pointer
+	 * to a `DebugCallback` or one of the `Kind` enumeration, depending on
+	 * whether the tag is set or not.
+	 */
+	uintptr_t kind;
+};
+
+/**
+ * Library function that writes a debug message.  This runs with interrupts
+ * disabled (to avoid interleaving) and prints an array of debug messages. This
+ * is intended to allow a single call to print multiple format strings without
+ * requiring the format strings to be copied, so that the debugging APIs can
+ * wrap a user-provided format string.
+ */
+__cheri_libcall void
+debug_log_message_write(const char                 *context,
+                        const char                 *format,
+                        struct DebugFormatArgument *messages,
+                        size_t                      messageCount);
+
+/**
+ * Helper to write a debug message reporting an assertion or invariant failure.
+ * This should be used only with the helper macros / templates in `debug.h[h]`.
+ * This takes the kind of failure (for example, assert or invariant), the file,
+ * function, and line number where the failure occurred, a format string, and
+ * an array of arguments to the format string.
+ */
+__cheri_libcall void debug_report_failure(const char                 *kind,
+                                          const char                 *file,
+                                          const char                 *function,
+                                          int                         line,
+                                          const char                 *fmt,
+                                          struct DebugFormatArgument *arguments,
+                                          size_t argumentCount);
diff --git a/sdk/include/__macro_map.h b/sdk/include/__macro_map.h
new file mode 100644
index 0000000..ac507b7
--- /dev/null
+++ b/sdk/include/__macro_map.h
@@ -0,0 +1,69 @@
+/*
+ * Created by William Swanson in 2012.
+ *
+ * I, William Swanson, dedicate this work to the public domain.
+ * I waive all rights to the work worldwide under copyright law,
+ * including all related and neighboring rights,
+ * to the extent allowed by law.
+ *
+ * You can copy, modify, distribute and perform the work,
+ * even for commercial purposes, all without asking permission.
+ *
+ * CHERIoT modifications:
+ * - Added CHERIOT_ prefix to all macros to avoid namespace pollution
+ * - Used pragma once instead of include guards
+ */
+
+#pragma once
+
+#define CHERIOT_EVAL0(...) __VA_ARGS__
+#define CHERIOT_EVAL1(...)                                                     \
+	CHERIOT_EVAL0(CHERIOT_EVAL0(CHERIOT_EVAL0(__VA_ARGS__)))
+#define CHERIOT_EVAL2(...)                                                     \
+	CHERIOT_EVAL1(CHERIOT_EVAL1(CHERIOT_EVAL1(__VA_ARGS__)))
+#define CHERIOT_EVAL3(...)                                                     \
+	CHERIOT_EVAL2(CHERIOT_EVAL2(CHERIOT_EVAL2(__VA_ARGS__)))
+#define CHERIOT_EVAL4(...)                                                     \
+	CHERIOT_EVAL3(CHERIOT_EVAL3(CHERIOT_EVAL3(__VA_ARGS__)))
+#define CHERIOT_EVAL(...)                                                      \
+	CHERIOT_EVAL4(CHERIOT_EVAL4(CHERIOT_EVAL4(__VA_ARGS__)))
+
+#define CHERIOT_MAP_END(...)
+#define CHERIOT_MAP_OUT
+#define CHERIOT_MAP_COMMA ,
+
+#define CHERIOT_MAP_GET_END2() 0, CHERIOT_MAP_END
+#define CHERIOT_MAP_GET_END1(...) CHERIOT_MAP_GET_END2
+#define CHERIOT_MAP_GET_END(...) CHERIOT_MAP_GET_END1
+#define CHERIOT_MAP_NEXT0(test, next, ...) next CHERIOT_MAP_OUT
+#define CHERIOT_MAP_NEXT1(test, next) CHERIOT_MAP_NEXT0(test, next, 0)
+#define CHERIOT_MAP_NEXT(test, next)                                           \
+	CHERIOT_MAP_NEXT1(CHERIOT_MAP_GET_END test, next)
+
+#define CHERIOT_MAP0(f, x, peek, ...)                                          \
+	f(x) CHERIOT_MAP_NEXT(peek, CHERIOT_MAP1)(f, peek, __VA_ARGS__)
+#define CHERIOT_MAP1(f, x, peek, ...)                                          \
+	f(x) CHERIOT_MAP_NEXT(peek, CHERIOT_MAP0)(f, peek, __VA_ARGS__)
+
+#define CHERIOT_MAP_LIST_NEXT1(test, next)                                     \
+	CHERIOT_MAP_NEXT0(test, CHERIOT_MAP_COMMA next, 0)
+#define CHERIOT_MAP_LIST_NEXT(test, next)                                      \
+	CHERIOT_MAP_LIST_NEXT1(CHERIOT_MAP_GET_END test, next)
+
+#define CHERIOT_MAP_LIST0(f, x, peek, ...)                                     \
+	f(x) CHERIOT_MAP_LIST_NEXT(peek, CHERIOT_MAP_LIST1)(f, peek, __VA_ARGS__)
+#define CHERIOT_MAP_LIST1(f, x, peek, ...)                                     \
+	f(x) CHERIOT_MAP_LIST_NEXT(peek, CHERIOT_MAP_LIST0)(f, peek, __VA_ARGS__)
+
+/**
+ * Applies the function macro `f` to each of the remaining parameters.
+ */
+#define CHERIOT_MAP(f, ...)                                                    \
+	CHERIOT_EVAL(CHERIOT_MAP1(f, __VA_ARGS__, ()()(), ()()(), ()()(), 0))
+
+/**
+ * Applies the function macro `f` to each of the remaining parameters and
+ * inserts commas between the results.
+ */
+#define CHERIOT_MAP_LIST(f, ...)                                               \
+	CHERIOT_EVAL(CHERIOT_MAP_LIST1(f, __VA_ARGS__, ()()(), ()()(), ()()(), 0))
diff --git a/sdk/include/cdefs.h b/sdk/include/cdefs.h
index b1f30dc..1c6eca3 100644
--- a/sdk/include/cdefs.h
+++ b/sdk/include/cdefs.h
@@ -62,6 +62,13 @@
 #define __alloc_size(x) __attribute__((alloc_size(x)))
 #define __alloc_align(x) __attribute__((alloc_align(x)))
 #define __cheri_callback __attribute__((cheri_ccallback))
+#if __has_attribute(cheriot_minimum_stack)
+#	define __cheriot_minimum_stack(x) __attribute__((cheriot_minimum_stack(x)))
+#else
+#	warning                                                                    \
+	  "cheriot_minimum_stack attribute not supported, please update your compiler"
+#	define __cheriot_minimum_stack(x)
+#endif
 // When running clang-tidy, we use the same compile flags for everything and so
 // will get errors about things being defined in the wrong compartment, so
 // define away the compartment name and pretend everything is local for now.
diff --git a/sdk/include/debug.h b/sdk/include/debug.h
new file mode 100644
index 0000000..9e6a56a
--- /dev/null
+++ b/sdk/include/debug.h
@@ -0,0 +1,139 @@
+#pragma once
+#include <__debug.h>
+#include <__macro_map.h>
+
+#ifndef __cplusplus
+
+/**
+ * Helper macro to convert the type of an argument to the corresponding
+ * `DebugFormatArgument` value.
+ *
+ * Should not be used directly.
+ */
+#	define CHERIOT_DEBUG_MAP_ARGUMENT(x)                                                \
+		{                                                                                \
+			(uintptr_t)(x), _Generic((x),                                              \
+                    _Bool: DebugFormatArgumentBool,                                               \
+                    char: DebugFormatArgumentCharacter,                                           \
+                    short: DebugFormatArgumentSignedNumber32,                                \
+                    unsigned short: DebugFormatArgumentUnsignedNumber32,                            \
+                    int: DebugFormatArgumentSignedNumber32,                                \
+                    unsigned int: DebugFormatArgumentUnsignedNumber32,                            \
+                    signed long long: DebugFormatArgumentSignedNumber64,                          \
+                    unsigned long long: DebugFormatArgumentUnsignedNumber64,                      \
+                    char *: DebugFormatArgumentCString, \
+					default: DebugFormatArgumentPointer) \
+		}
+
+/**
+ * Helper to map a list of arguments to an initialiser for a
+ * `DebugFormatArgument` array.
+ *
+ * Should not be used directly.
+ */
+#	define CHERIOT_DEBUG_MAP_ARGUMENTS(...)                                   \
+		CHERIOT_MAP_LIST(CHERIOT_DEBUG_MAP_ARGUMENT, __VA_ARGS__)
+
+/**
+ * Macro that logs a message.  The `context` argument is a string that is
+ * printed in magenta at the start of the line, followed by the format string.
+ * Each format argument is referenced with {} in the format string and is
+ * inserted in the output with the default rendering for that type.
+ */
+#	define CHERIOT_DEBUG_LOG(context, msg, ...)                               \
+		do                                                                     \
+		{                                                                      \
+			struct DebugFormatArgument args[] = {                              \
+			  __VA_OPT__(CHERIOT_DEBUG_MAP_ARGUMENTS(__VA_ARGS__))};           \
+			debug_log_message_write(                                           \
+			  context,                                                         \
+			  msg,                                                             \
+			  args,                                                            \
+			  0 __VA_OPT__(+(sizeof(args) / sizeof(args[0]))));                \
+		} while (0)
+
+/**
+ * Assert that `condition` is true, printing a message and aborting the
+ * compartment invocation if not.
+ */
+#	define CHERIOT_INVARIANT(condition, msg, ...)                             \
+		do                                                                     \
+		{                                                                      \
+			if (!(condition))                                                  \
+			{                                                                  \
+				struct DebugFormatArgument args[] = {                          \
+				  __VA_OPT__(CHERIOT_DEBUG_MAP_ARGUMENTS(__VA_ARGS__))};       \
+				debug_report_failure(                                          \
+				  "Invariant",                                                 \
+				  __FILE__,                                                    \
+				  __func__,                                                    \
+				  __LINE__,                                                    \
+				  msg,                                                         \
+				  args,                                                        \
+				  0 __VA_OPT__(+(sizeof(args) / sizeof(args[0]))));            \
+				__builtin_trap();                                              \
+			}                                                                  \
+		} while (0)
+
+#else
+#	include <debug.hh>
+
+namespace
+{
+	template<typename... Args>
+	__always_inline void
+	cheriot_debug_log(const char *context, const char *msg, Args... args)
+	{
+		DebugFormatArgument arguments[sizeof...(Args)];
+		make_debug_arguments_list(arguments, args...);
+		debug_log_message_write(context, msg, arguments, sizeof...(Args));
+	}
+
+	template<typename... Args>
+	__always_inline void cheriot_invariant(bool        condition,
+	                                       const char *file,
+	                                       const char *function,
+	                                       int         line,
+	                                       const char *msg,
+	                                       Args... args)
+	{
+		if (!condition)
+		{
+			DebugFormatArgument arguments[sizeof...(Args)];
+			make_debug_arguments_list(arguments, args...);
+			debug_report_failure("Invariant",
+			                     file,
+			                     function,
+			                     line,
+			                     msg,
+			                     arguments,
+			                     sizeof...(Args));
+		}
+	}
+} // namespace
+
+/**
+ * C++ version of `CHERIOT_DEBUG_LOG`.  This uses the C++ helpers and so will
+ * pretty-print a richer set of types than the C version.
+ */
+#	define CHERIOT_DEBUG_LOG(context, msg, ...)                               \
+		do                                                                     \
+		{                                                                      \
+			cheriot_debug_log(context, msg __VA_OPT__(, ) __VA_ARGS__);        \
+		} while (0)
+
+/**
+ * C++ version of `CHERIOT_INVARIANT`.  This uses the C++ helpers and so will
+ * pretty-print a richer set of types than the C version.
+ */
+#	define CHERIOT_INVARIANT(condition, msg, ...)                             \
+		do                                                                     \
+		{                                                                      \
+			cheriot_invariant(condition,                                       \
+			                  __FILE__,                                        \
+			                  __func__,                                        \
+			                  __LINE__,                                        \
+			                  msg __VA_OPT__(, ) __VA_ARGS__);                 \
+		} while (0)
+
+#endif
diff --git a/sdk/include/debug.hh b/sdk/include/debug.hh
index d107a78..21fb6b8 100644
--- a/sdk/include/debug.hh
+++ b/sdk/include/debug.hh
@@ -2,13 +2,14 @@
 // SPDX-License-Identifier: MIT
 
 #pragma once
-#include "cdefs.h"
+#include <__debug.h>
 #include <cheri.hh>
 #include <compartment.h>
 #include <concepts>
 #include <cstddef>
 #include <platform-uart.hh>
 #include <string.h>
+#include <switcher.h>
 
 #include <array>
 #include <string_view>
@@ -101,48 +102,6 @@
  */
 using DebugCallback = void (*)(uintptr_t, DebugWriter &);
 
-struct DebugFormatArgument
-{
-	/**
-	 * The kind of value, for values that have special-cased handling.
-	 */
-	enum Kind : ptraddr_t
-	{
-		/// Boolean, printed as "true" or "false".
-		Bool,
-		/// Single character.
-		Character,
-		/// Signed 32-bit integer, printed as decimal.
-		SignedNumber32,
-		/// Unsigned 32-bit integer, printed as hexadecimal
-		UnsignedNumber32,
-		/// Signed 64-bit integer, printed as decimal.
-		SignedNumber64,
-		/// Unsigned 64-bit integer, printed as hexadecimal.
-		UnsignedNumber64,
-		/// Pointer, printed as a full capability.
-		Pointer,
-		/// Special case for permission sets, printed as in the capability
-		/// format.
-		PermissionSet,
-		/// C string, printed as-is.
-		CString,
-		/// String view, printed as-is.
-		StringView,
-	};
-
-	/**
-	 * The value that is being written.
-	 */
-	uintptr_t value;
-	/**
-	 * The kind of value that is being written.  This is either a pointer
-	 * to a `DebugCallback` or one of the `Kind` enumeration, depending on
-	 * whether the tag is set or not.
-	 */
-	uintptr_t kind;
-};
-
 /**
  * Adaptor that turns an argument of type `T` into a `DebugFormatArgument`.
  *
@@ -160,7 +119,8 @@
 {
 	__always_inline static DebugFormatArgument construct(bool value)
 	{
-		return {static_cast<uintptr_t>(value), DebugFormatArgument::Bool};
+		return {static_cast<uintptr_t>(value),
+		        DebugFormatArgumentKind::DebugFormatArgumentBool};
 	}
 };
 
@@ -172,7 +132,8 @@
 {
 	__always_inline static DebugFormatArgument construct(char value)
 	{
-		return {static_cast<uintptr_t>(value), DebugFormatArgument::Character};
+		return {static_cast<uintptr_t>(value),
+		        DebugFormatArgumentKind::DebugFormatArgumentCharacter};
 	}
 };
 
@@ -185,7 +146,7 @@
 	__always_inline static DebugFormatArgument construct(uint8_t value)
 	{
 		return {static_cast<uintptr_t>(value),
-		        DebugFormatArgument::UnsignedNumber32};
+		        DebugFormatArgumentKind::DebugFormatArgumentUnsignedNumber32};
 	}
 };
 
@@ -198,7 +159,7 @@
 	__always_inline static DebugFormatArgument construct(uint16_t value)
 	{
 		return {static_cast<uintptr_t>(value),
-		        DebugFormatArgument::UnsignedNumber32};
+		        DebugFormatArgumentKind::DebugFormatArgumentUnsignedNumber32};
 	}
 };
 
@@ -211,7 +172,7 @@
 	__always_inline static DebugFormatArgument construct(uint32_t value)
 	{
 		return {static_cast<uintptr_t>(value),
-		        DebugFormatArgument::UnsignedNumber32};
+		        DebugFormatArgumentKind::DebugFormatArgumentUnsignedNumber32};
 	}
 };
 
@@ -229,7 +190,8 @@
 	{
 		uintptr_t fudgedValue;
 		memcpy(&fudgedValue, &value, sizeof(fudgedValue));
-		return {fudgedValue, DebugFormatArgument::UnsignedNumber64};
+		return {fudgedValue,
+		        DebugFormatArgumentKind::DebugFormatArgumentUnsignedNumber64};
 	}
 };
 
@@ -242,7 +204,7 @@
 	__always_inline static DebugFormatArgument construct(int8_t value)
 	{
 		return {static_cast<uintptr_t>(value),
-		        DebugFormatArgument::SignedNumber32};
+		        DebugFormatArgumentKind::DebugFormatArgumentSignedNumber32};
 	}
 };
 
@@ -255,7 +217,7 @@
 	__always_inline static DebugFormatArgument construct(int16_t value)
 	{
 		return {static_cast<uintptr_t>(value),
-		        DebugFormatArgument::SignedNumber32};
+		        DebugFormatArgumentKind::DebugFormatArgumentSignedNumber32};
 	}
 };
 
@@ -268,7 +230,7 @@
 	__always_inline static DebugFormatArgument construct(int32_t value)
 	{
 		return {static_cast<uintptr_t>(value),
-		        DebugFormatArgument::SignedNumber32};
+		        DebugFormatArgumentKind::DebugFormatArgumentSignedNumber32};
 	}
 };
 
@@ -287,7 +249,8 @@
 		static_assert(sizeof(uintptr_t) == sizeof(uint64_t));
 		uintptr_t fudgedValue;
 		memcpy(&fudgedValue, &value, sizeof(fudgedValue));
-		return {fudgedValue, DebugFormatArgument::SignedNumber64};
+		return {fudgedValue,
+		        DebugFormatArgumentKind::DebugFormatArgumentSignedNumber64};
 	}
 };
 
@@ -300,7 +263,7 @@
 	__always_inline static DebugFormatArgument construct(const char *value)
 	{
 		return {reinterpret_cast<uintptr_t>(value),
-		        DebugFormatArgument::CString};
+		        DebugFormatArgumentKind::DebugFormatArgumentCString};
 	}
 };
 
@@ -317,7 +280,7 @@
 	construct(std::string_view &value)
 	{
 		return {reinterpret_cast<uintptr_t>(&value),
-		        DebugFormatArgument::StringView};
+		        DebugFormatArgumentKind::DebugFormatArgumentStringView};
 	}
 };
 
@@ -334,7 +297,7 @@
 	{
 #ifdef CHERIOT_AVOID_CAPRELOCS
 		return {static_cast<uintptr_t>(value),
-		        DebugFormatArgument::UnsignedNumber32};
+		        DebugFormatArgumentKind::DebugFormatArgumentUnsignedNumber32};
 #else
 		return {static_cast<uintptr_t>(value),
 		        reinterpret_cast<uintptr_t>(&debug_enum_helper<T>)};
@@ -352,7 +315,19 @@
 	construct(CHERI::PermissionSet value)
 	{
 		return {static_cast<uintptr_t>(value.as_raw()),
-		        DebugFormatArgument::PermissionSet};
+		        DebugFormatArgumentKind::DebugFormatArgumentPermissionSet};
+	}
+};
+
+/**
+ * Null pointer specialisation.
+ */
+template<>
+struct DebugFormatArgumentAdaptor<std::nullptr_t>
+{
+	__always_inline static DebugFormatArgument construct(std::nullptr_t)
+	{
+		return {0, DebugFormatArgumentKind::DebugFormatArgumentPointer};
 	}
 };
 
@@ -366,7 +341,7 @@
 	{
 		return {reinterpret_cast<uintptr_t>(
 		          static_cast<const volatile void *>(value)),
-		        DebugFormatArgument::Pointer};
+		        DebugFormatArgumentKind::DebugFormatArgumentPointer};
 	}
 };
 
@@ -382,7 +357,7 @@
 	{
 		return {reinterpret_cast<uintptr_t>(
 		          static_cast<const volatile void *>(value)),
-		        DebugFormatArgument::Pointer};
+		        DebugFormatArgumentKind::DebugFormatArgumentPointer};
 	}
 };
 
@@ -397,7 +372,7 @@
 	{
 		return {static_cast<uintptr_t>(value),
 
-		        DebugFormatArgument::UnsignedNumber32};
+		        DebugFormatArgumentKind::DebugFormatArgumentUnsignedNumber32};
 	}
 };
 
@@ -741,4 +716,92 @@
 		Assert(auto, const char *, Ts &&...) -> Assert<Ts...>;
 	};
 
+	enum class StackCheckMode
+	{
+		Disabled,
+		Logging,
+		Asserting,
+	};
+
+	/**
+	 * Check the (dynamic) stack usage of a function, including all of its
+	 * callees.  This is intended to be used in compartment entry points to
+	 * check the stack size that is required for that entry point.  Use this
+	 * with some test vectors that explore different code paths to identify the
+	 * maximum stack usage.  This can then be used with the
+	 * [[cheriot::minimum_stack]] attribute to ensure that the switcher will
+	 * never invoke the entry point with insufficient stack.
+	 *
+	 * Note that this records only the stack usage for the current compartment
+	 * invocation (including any invoked shared libraries).  If this
+	 * compartment calls others, then a larger stack may be required.  This
+	 * failure is recoverable: cross-compartment calls should always be assumed
+	 * to potentially fail.  This check is to ensure that an attacker cannot
+	 * cause stack overflow to crash the compartment, not to ensure that an
+	 * attacker cannot cause stack overflow to make the operation that they
+	 * requested fail.
+	 *
+	 * Stack checks run in one of three modes:
+	 *
+	 *  - Disabled: The checks do not run.
+	 *  - Logging: The checks run and print a message if the stack usage
+	 *    exceeds expectations.
+	 *  - Asserting: After printing the message, the function will trap.
+	 *
+	 * In logging mode, one message will be printed for each invocation of the
+	 * function that exceeds the previous maximum stack usage.
+	 *
+	 * An instance of this should be created as a local variable on entry to a
+	 * function.  The destructor (which prints the message) will then run after
+	 * any other destructors, ensuring the most accurate stack usage
+	 * measurement.
+	 *
+	 * Unfortunately, default arguments for templates are not evaluated in the
+	 * enclosing scope and so `__builtin_FUNCTION()` cannot be used here.
+	 * Instead, we must pass this explicitly, typically as something like:
+	 *
+	 * ```c++
+	 * StackUsageCheck<DebugFlag, 128, __PRETTY_FUNCTION__> stackCheck;
+	 * ```
+	 */
+	template<StackCheckMode Mode, size_t Expected, DebugContext Fn>
+	class StackUsageCheck
+	{
+		/**
+		 * The expected maximum.  This class is templated on the function name
+		 * and so there is one copy of this per function.
+		 */
+		static inline size_t stackUsage = Expected;
+
+		public:
+		/**
+		 * Default constructor, does nothing.
+		 */
+		StackUsageCheck() = default;
+
+		/**
+		 * Destructor, runs at the end of the function to print the message.
+		 */
+		__always_inline ~StackUsageCheck()
+		{
+			if constexpr (Mode != StackCheckMode::Disabled)
+			{
+				ptraddr_t lowest = stack_lowest_used_address();
+				ptraddr_t highest =
+				  CHERI::Capability{__builtin_cheri_stack_get()}.top();
+				size_t used = highest - lowest;
+				if (used > stackUsage)
+				{
+					stackUsage = used;
+					ConditionalDebug<true, Fn>::log("Stack used: {} bytes",
+					                                stackUsage);
+					if constexpr (Mode == StackCheckMode::Asserting)
+					{
+						__builtin_trap();
+					}
+				}
+			}
+		}
+	};
+
 } // namespace
diff --git a/sdk/include/errno.h b/sdk/include/errno.h
index 1dadd3c..df74785 100644
--- a/sdk/include/errno.h
+++ b/sdk/include/errno.h
@@ -84,6 +84,7 @@
 #define EOWNERDEAD 130     // Previous owner died.
 #define ENOTRECOVERABLE 131 // State not recoverable.
 #define EOVERFLOW 139       // Value too large to be stored in data type.
+#define ENOTENOUGHSTACK 140 // Insufficient stack space for cross-compartment call.
 #define EWOULDBLOCK EAGAIN  // Operation would block.
 #define ENOTSUP EOPNOTSUPP  // Not supported.
 #define __ELASTERROR 2000   // Users can add values starting here.
diff --git a/sdk/include/event.h b/sdk/include/event.h
index 2c24484..0fa3d41 100644
--- a/sdk/include/event.h
+++ b/sdk/include/event.h
@@ -108,3 +108,12 @@
  */
 int __cheri_libcall eventgroup_destroy(struct SObjStruct *heapCapability,
                                        struct EventGroup *group);
+
+/**
+ * Destroy an event group without tacking the lock.
+ *
+ * This API is inherently racy. Its main purpose is to cleanup the event group
+ * in an error handler context, when taking lock may be impossible.
+ */
+int __cheri_libcall eventgroup_destroy_force(struct SObjStruct *heapCapability,
+                                             struct EventGroup *group);
diff --git a/sdk/include/fail-simulator-on-error.h b/sdk/include/fail-simulator-on-error.h
index 3fc904e..52a65d3 100644
--- a/sdk/include/fail-simulator-on-error.h
+++ b/sdk/include/fail-simulator-on-error.h
@@ -72,7 +72,10 @@
 		}
 		else
 		{
-			// An unexpected error -- log it and end the simulation with error.
+			// An unexpected error -- log it and end the simulation
+			// with error.  Note: handle CZR differently as
+			// `get_register_value` will return a nullptr which we
+			// cannot dereference.
 			DebugErrorHandler::log(
 			  "{} error at {} (return address: {}), with capability register "
 			  "{}: {}",
@@ -80,7 +83,9 @@
 			  frame->pcc,
 			  frame->get_register_value<CHERI::RegisterNumber::CRA>(),
 			  registerNumber,
-			  *frame->get_register_value(registerNumber));
+			  registerNumber == CHERI::RegisterNumber::CZR
+			    ? nullptr
+			    : *frame->get_register_value(registerNumber));
 			simulation_exit(1);
 		}
 	}
diff --git a/sdk/include/function_wrapper.hh b/sdk/include/function_wrapper.hh
new file mode 100644
index 0000000..50ca248
--- /dev/null
+++ b/sdk/include/function_wrapper.hh
@@ -0,0 +1,109 @@
+#include <cdefs.h>
+#include <functional>
+#include <tuple>
+
+/**
+ * Base template for `FunctionWrapper`, never used.
+ */
+template<typename FnType>
+class FunctionWrapper;
+
+/**
+ * A non-owning type-erased reference to a callable object.  This is used
+ * to pass lambdas (and similar) down the stack without increasing code
+ * size by template specialisation.  Instances of this class must not be
+ * stored, they should be used only for passing type-erased callable objects
+ * down the stack.
+ *
+ * Instances of this class are two words: a reference to the lambda, and a
+ * wrapper callback that invokes the lambda.
+ *
+ * This is similar to `std::function` but is non-owning and so is guaranteed
+ * not to allocate memory (the called function may capture memory).
+ */
+template<class R, class... Args>
+class FunctionWrapper<R(Args...)>
+{
+	/**
+	 * Storage for the type-erased function.  This holds the reference to
+	 * lambda and to the invoke function.
+	 */
+	alignas(void *) char storage[2 * sizeof(void *)];
+
+	/**
+	 * Base type for the type-erased function.  This defines the virtual
+	 * function that is used to invoke the captured lambda.
+	 */
+	struct ErasedFunctionWrapperBase
+	{
+		virtual R operator()(Args... args) = 0;
+	};
+
+	/**
+	 * Returns a pointer to the storage, cast to the type-erased function
+	 * type.
+	 */
+	ErasedFunctionWrapperBase &stored_function()
+	{
+		return *reinterpret_cast<ErasedFunctionWrapperBase *>(storage);
+	}
+
+	/**
+	 * Templated subclass that is specialised for each concrete callable
+	 * type `T` that is passed.  One instance of this will be created for
+	 * each lambda type, with a single method in its vtable that invokes
+	 * the lambda.
+	 */
+	template<typename T>
+	class ErasedFunctionWrapper : public ErasedFunctionWrapperBase
+	{
+		/// Pointer to the captured lambda.
+		T &&fn;
+
+		public:
+		/**
+		 * Invoke function.  This is virtual and overrides the version in
+		 * the parent class, allowing this to be called from code that does
+		 * not know the cocrete type of the lambda.
+		 */
+		R operator()(Args... args) override
+		{
+			return fn(std::forward<Args>(args)...);
+		}
+
+		/**
+		 * Construct the type-erased function wrapper, capturing the
+		 * lambda.
+		 */
+		ErasedFunctionWrapper(T &&fn) : fn{std::forward<T>(fn)} {}
+	};
+
+	public:
+	/**
+	 * This is a non-owning reference, delete its copy and move
+	 * constructors to avoid accidental copies.
+	 */
+	FunctionWrapper(FunctionWrapper &)  = delete;
+	FunctionWrapper(FunctionWrapper &&) = delete;
+	FunctionWrapper &operator=(FunctionWrapper &&) = delete;
+
+	/**
+	 * Construct the type-erased function wrapper, capturing the lambda.
+	 */
+	template<typename T>
+	__always_inline FunctionWrapper(T &&fn)
+	{
+		// Make sure that we got the size for the storage right!
+		static_assert(sizeof(storage) >= sizeof(ErasedFunctionWrapper<T>));
+		// Construct the type-erased function in place.
+		new (storage) ErasedFunctionWrapper<T>(std::forward<T>(fn));
+	}
+
+	/**
+	 * Invoke the captured lambda.
+	 */
+	__always_inline R operator()(Args... args)
+	{
+		return stored_function()(std::forward<Args>(args)...);
+	}
+};
diff --git a/sdk/include/locks.h b/sdk/include/locks.h
index 676e2d2..1dbab8c 100644
--- a/sdk/include/locks.h
+++ b/sdk/include/locks.h
@@ -135,6 +135,10 @@
  * currently attempting to acquire the lock will wake and fail to acquire the
  * lock.  This should be called before deallocating an object that contains a
  * lock.
+ *
+ * Note that callers do not need to hold the lock; the ability to upgrade for
+ * destruction without holding the lock is useful for cleaning up from the
+ * error handler.
  */
 void __cheri_libcall
 flaglock_upgrade_for_destruction(struct FlagLockState *lock);
diff --git a/sdk/include/locks.hh b/sdk/include/locks.hh
index 4d1050c..3f7808e 100644
--- a/sdk/include/locks.hh
+++ b/sdk/include/locks.hh
@@ -299,6 +299,16 @@
 	}
 
 	/**
+	 * Unwrap the lock without unlocking it. This is useful to call before
+	 * destroying a lock.
+	 */
+	void release()
+	{
+		wrappedLock = nullptr;
+		isOwned     = false;
+	}
+
+	/**
 	 * If the underlying lock type supports locking with a timeout, try to lock
 	 * it with the specified timeout. This must be called with the lock
 	 * unlocked.  Returns true if the lock has been acquired, false otherwise.
diff --git a/sdk/include/microvium/microvium_port.h b/sdk/include/microvium/microvium_port.h
index 5028346..1b2e4b1 100644
--- a/sdk/include/microvium/microvium_port.h
+++ b/sdk/include/microvium/microvium_port.h
@@ -298,8 +298,13 @@
  */
 #define MVM_CONTEXTUAL_MALLOC(size, context)                                   \
 	({                                                                         \
-		Timeout t = {0, 0};                                                    \
-		heap_allocate(&t, context, size);                                      \
+		Timeout t   = {0, 0};                                                  \
+		void   *ret = heap_allocate(&t, context, size);                        \
+		if (!__builtin_cheri_tag_get(ret))                                     \
+		{                                                                      \
+			ret = NULL;                                                        \
+		}                                                                      \
+		ret;                                                                   \
 	})
 #define MVM_CONTEXTUAL_FREE(ptr, context) heap_free(context, ptr)
 
diff --git a/sdk/include/platform/arty-a7/platform-ethernet.hh b/sdk/include/platform/arty-a7/platform-ethernet.hh
index 900fe1c..dc5392a 100644
--- a/sdk/include/platform/arty-a7/platform-ethernet.hh
+++ b/sdk/include/platform/arty-a7/platform-ethernet.hh
@@ -480,9 +480,18 @@
 	KunyanEthernet(const KunyanEthernet &) = delete;
 	KunyanEthernet(KunyanEthernet &&)      = delete;
 
+	/**
+	 * This device does not have a unique MAC address and so users must provide
+	 * a locally administered MAC address if more than one device is present on
+	 * the same network.
+	 */
+	static constexpr bool has_unique_mac_address()
+	{
+		return false;
+	}
+
 	static constexpr MACAddress mac_address_default()
 	{
-		// FIXME: Generate a random locally administered MAC for each device.
 		return {0x60, 0x6A, 0x6A, 0x37, 0x47, 0x88};
 	}
 
diff --git a/sdk/include/platform/concepts/ethernet.hh b/sdk/include/platform/concepts/ethernet.hh
index 4101cec..30ecc2f 100644
--- a/sdk/include/platform/concepts/ethernet.hh
+++ b/sdk/include/platform/concepts/ethernet.hh
@@ -40,6 +40,14 @@
 		T::mac_address_default()
 		} -> std::same_as<std::array<uint8_t, 6>>;
 	/**
+	 * Is the default MAC address unique?  If the device (e.g. soft MAC)
+	 * doesn't have its own hardware MAC address then callers may prefer to
+	 * generate a locally administered MAC address randomly.
+	 */
+	{
+		T::has_unique_mac_address()
+		} -> std::convertible_to<bool>;
+	/**
 	 * Set the MAC address of this adaptor.
 	 */
 	{adaptor.mac_address_set(macAddress)};
diff --git a/sdk/include/platform/ibex/platform-hardware_revoker.hh b/sdk/include/platform/ibex/platform-hardware_revoker.hh
index f088f0a..c039cce 100644
--- a/sdk/include/platform/ibex/platform-hardware_revoker.hh
+++ b/sdk/include/platform/ibex/platform-hardware_revoker.hh
@@ -7,6 +7,7 @@
 #include <compartment-macros.h>
 #include <futex.h>
 #include <interrupt.h>
+#include <limits>
 #include <riscvreg.h>
 #include <stddef.h>
 #include <stdint.h>
@@ -126,11 +127,25 @@
 		uint32_t has_revocation_finished_for_epoch(uint32_t epoch)
 		{
 			auto current = system_epoch_get();
+			// We want to know if current is greater than epoch, but current
+			// may have wrapped.  Perform unsigned subtraction (guaranteed to
+			// wrap) and then coerce the result to a signed value.  This will
+			// be correct unless we have more than 2^31 revocations in between
+			// checks
+			std::make_signed_t<decltype(current)> distance = current - epoch;
 			if (AllowPartial)
 			{
-				return current > epoch;
+				return distance >= 0;
 			}
-			return current - epoch >= (2 + (epoch & 1));
+			// The allocator stores the epoch when things can be popped, which
+			// is always a complete (even) epoch.
+#ifndef CLANG_TIDY
+			Debug::Assert((epoch & 1) == 0, "Epoch must be even");
+#endif
+			// If the current epoch is odd then the epoch needs to be at least
+			// two more, to capture the fact that this is a complete epoch.
+			decltype(distance) minimumRequired = 1 + (epoch & 1);
+			return distance > minimumRequired;
 		}
 
 		/**
diff --git a/sdk/include/platform/sunburst/platform-early_boot.inc b/sdk/include/platform/sunburst/platform-early_boot.inc
new file mode 100644
index 0000000..d21e9fe
--- /dev/null
+++ b/sdk/include/platform/sunburst/platform-early_boot.inc
@@ -0,0 +1,7 @@
+	// The shadow memory may not be zeroed, ensure it is before we start or
+	// random capability loads will fail.
+	li         a0, SUNBURST_SHADOW_BASE
+	cspecialr  ca4, mtdc
+	csetaddr   ca0, ca4, a0
+	li         a1, SUNBURST_SHADOW_BASE + SUNBURST_SHADOW_SIZE
+	cjal       .Lfill_block
diff --git a/sdk/include/platform/sunburst/platform-gpio.hh b/sdk/include/platform/sunburst/platform-gpio.hh
new file mode 100644
index 0000000..9aae88b
--- /dev/null
+++ b/sdk/include/platform/sunburst/platform-gpio.hh
@@ -0,0 +1,132 @@
+#pragma once
+#include <cdefs.h>
+#include <stdint.h>
+
+/**
+ * Represents the state of the Sonata's joystick.
+ *
+ * Note, that up to three of the bits may be asserted at any given time.
+ * There may be up to two cardinal directions asserted when the joystick is
+ * pushed in a diagonal and the joystick may be pressed while being pushed in a
+ * given direction.
+ */
+enum class SonataJoystick : uint8_t
+{
+	Left    = 1 << 0,
+	Up      = 1 << 1,
+	Pressed = 1 << 2,
+	Down    = 1 << 3,
+	Right   = 1 << 4,
+};
+
+/**
+ * A Simple Driver for the Sonata's GPIO.
+ *
+ * Documentation source can be found at:
+ * https://github.com/lowRISC/sonata-system/blob/1a59633d2515d4fe186a07d53e49ff95c18d9bbf/doc/ip/gpio.md
+ *
+ * Rendered documentation is served from:
+ * https://lowrisc.org/sonata-system/doc/ip/gpio.html
+ */
+struct SonataGPIO
+{
+	uint32_t output;
+	uint32_t input;
+	uint32_t debouncedInput;
+	uint32_t debouncedThreshold;
+	uint32_t raspberryPiHeader;
+	uint32_t raspberryPiMask;
+	uint32_t arduinoShieldHeader;
+	uint32_t arduinoShieldMask;
+
+	/**
+	 * The bit index of the first GPIO pin connected to a user LED.
+	 */
+	static constexpr uint32_t FirstLED = 4;
+	/**
+	 * The bit index of the last GPIO pin connected to a user LED.
+	 */
+	static constexpr uint32_t LastLED = 11;
+	/**
+	 * The number of user LEDs.
+	 */
+	static constexpr uint32_t LEDCount = LastLED - FirstLED + 1;
+	/**
+	 * The mask covering the GPIO pins used for user LEDs.
+	 */
+	static constexpr uint32_t LEDMask = ((1 << LEDCount) - 1) << FirstLED;
+
+	/**
+	 * The output bit mask for a given user LED index
+	 */
+	constexpr static uint32_t led_bit(uint32_t index)
+	{
+		return (1 << (index + FirstLED)) & LEDMask;
+	}
+
+	/**
+	 * Switches off the LED at the given user LED index
+	 */
+	void led_on(uint32_t index) volatile
+	{
+		output = output | led_bit(index);
+	}
+
+	/**
+	 * Switches on the LED at the given user LED index
+	 */
+	void led_off(uint32_t index) volatile
+	{
+		output = output & ~led_bit(index);
+	}
+
+	/**
+	 * Toggles the LED at the given user LED index
+	 */
+	void led_toggle(uint32_t index) volatile
+	{
+		output = output ^ led_bit(index);
+	}
+
+	/**
+	 * The bit index of the first GPIO pin connected to a user switch.
+	 */
+	static constexpr uint32_t FirstSwitch = 5;
+	/**
+	 * The bit index of the last GPIO pin connected to a user switch.
+	 */
+	static constexpr uint32_t LastSwitch = 13;
+	/**
+	 * The number of user switches.
+	 */
+	static constexpr uint32_t SwitchCount = LastSwitch - FirstSwitch + 1;
+	/**
+	 * The mask covering the GPIO pins used for user switches.
+	 */
+	static constexpr uint32_t SwitchMask = ((1 << SwitchCount) - 1)
+	                                       << FirstSwitch;
+
+	/**
+	 * The input bit mask for a given user switch index
+	 */
+	constexpr static uint32_t switch_bit(uint32_t index)
+	{
+		return (1 << (index + FirstSwitch)) & SwitchMask;
+	}
+
+	/**
+	 * Returns the value of the switch at the given user switch index.
+	 */
+	bool read_switch(uint32_t index) volatile
+	{
+		return (input & switch_bit(index)) > 0;
+	}
+
+	/**
+	 * Returns the state of the joystick.
+	 */
+	SonataJoystick read_joystick() volatile
+	{
+		return static_cast<SonataJoystick>(input & 0x1f);
+	}
+};
diff --git a/sdk/include/platform/sunburst/platform-uart.hh b/sdk/include/platform/sunburst/platform-uart.hh
new file mode 100644
index 0000000..0f6d46d
--- /dev/null
+++ b/sdk/include/platform/sunburst/platform-uart.hh
@@ -0,0 +1,116 @@
+#pragma once
+#pragma push_macro("CHERIOT_PLATFORM_CUSTOM_UART")
+#define CHERIOT_PLATFORM_CUSTOM_UART
+#include_next <platform-uart.hh>
+#pragma pop_macro("CHERIOT_PLATFORM_CUSTOM_UART")
+
+/**
+ * OpenTitan UART
+ *
+ * This peripheral's source and documentation can be found at:
+ * https://github.com/lowRISC/opentitan/tree/ab878b5d3578939a04db72d4ed966a56a869b2ed/hw/ip/uart
+ *
+ * Rendered register documentation is served at:
+ * https://opentitan.org/book/hw/ip/uart/doc/registers.html
+ */
+template<unsigned DefaultBaudRate = 115'200>
+class OpenTitanUart
+{
+	public:
+	/**
+	 * Interrupt State Register.
+	 */
+	uint32_t intrState;
+	/**
+	 * Interrupt Enable Register.
+	 */
+	uint32_t intrEnable;
+	/**
+	 * Interrupt Test Register.
+	 */
+	uint32_t intrTest;
+	/**
+	 * Alert Test Register (unused).
+	 */
+	uint32_t alertTest;
+	/**
+	 * Control Register.
+	 */
+	uint32_t ctrl;
+	/**
+	 * Status Register.
+	 */
+	uint32_t status;
+	/**
+	 * UART Read Data.
+	 */
+	uint32_t rData;
+	/**
+	 * UART Write Data.
+	 */
+	uint32_t wData;
+	/**
+	 * UART FIFO Control Register.
+	 */
+	uint32_t fifoCtrl;
+	/**
+	 * UART FIFO Status Register.
+	 */
+	uint32_t fifoStatus;
+	/**
+	 * TX Pin Override Control.
+	 *
+	 * Gives direct SW control over TX pin state.
+	 */
+	uint32_t ovrd;
+	/**
+	 * UART Oversampled Values.
+	 */
+	uint32_t val;
+	/**
+	 * UART RX Timeout Control.
+	 */
+	uint32_t timeoutCtrl;
+
+	void init(unsigned baudRate = DefaultBaudRate) volatile
+	{
+		// NCO = 2^20 * baud rate / cpu frequency
+		const uint32_t NCO =
+		  ((static_cast<uint64_t>(baudRate) << 20) / CPU_TIMER_HZ);
+		// Set the baud rate and enable transmit & receive
+		ctrl = (NCO << 16) | 0b11;
+	};
+
+	bool can_write() volatile
+	{
+		return (fifoStatus & 0xff) < 32;
+	};
+
+	bool can_read() volatile
+	{
+		return ((fifoStatus >> 16) & 0xff) > 0;
+	};
+
+	/**
+	 * Write one byte, blocking until the byte is written.
+	 */
+	void blocking_write(uint8_t byte) volatile
+	{
+		while (!can_write()) {}
+		wData = byte;
+	}
+
+	/**
+	 * Read one byte, blocking until a byte is available.
+	 */
+	uint8_t blocking_read() volatile
+	{
+		while (!can_read()) {}
+		return rData;
+	}
+};
+
+#ifndef CHERIOT_PLATFORM_CUSTOM_UART
+using Uart = OpenTitanUart<>;
+static_assert(IsUart<Uart>);
+#endif
diff --git a/sdk/include/queue.h b/sdk/include/queue.h
index 162f91c..dc16e72 100644
--- a/sdk/include/queue.h
+++ b/sdk/include/queue.h
@@ -77,6 +77,20 @@
                                  size_t              elementCount);
 
 /**
+ * Destroys a queue. This wakes up all threads waiting to produce or consume,
+ * and makes them fail to acquire the lock, before deallocating the underlying
+ * allocation.
+ *
+ * This must be called on an unrestricted queue handle (*not* one returned by
+ * `queue_make_receive_handle` or `queue_make_send_handle`).
+ *
+ * Returns 0 on success. On failure, returns `-EPERM` if the queue handle is
+ * restricted (see comment above).
+ */
+int __cheri_libcall queue_destroy(struct SObjStruct  *heapCapability,
+                                  struct QueueHandle *handle);
+
+/**
  * Convert a queue handle returned from `queue_create` into one that can be
  * used *only* for receiving.
  *
diff --git a/sdk/include/stdio.h b/sdk/include/stdio.h
index 4d15320..fd6b605 100644
--- a/sdk/include/stdio.h
+++ b/sdk/include/stdio.h
@@ -5,6 +5,7 @@
 #define __STDIO_H__
 
 #include <cdefs.h>
+#include <compartment-macros.h>
 #include <stdarg.h>
 #include <stddef.h>
 
@@ -12,13 +13,56 @@
 #define EOF (-1)
 
 __BEGIN_DECLS
-int __cheri_libcall printf(const char *fmt, ...);
 
-#define name_printf(fmt, ...)                                                  \
-	printf(__XSTRING(__CHERI_COMPARTMENT__) ": " fmt, ##__VA_ARGS__)
+/**
+ * This is a very simple implementation of a subset of stdio and supports only
+ * UARTs.  The Uart type is often a C++ template type and so we can't forward
+ * declare it in a C header and so we use a volatile void* instead, which can
+ * be cast to the correct type inside the library.
+ */
+typedef volatile void FILE;
+
+#if DEVICE_EXISTS(uart0)
+#	define stdout MMIO_CAPABILITY(void, uart0)
+#	define stdin MMIO_CAPABILITY(void, uart0)
+#elif DEVICE_EXISTS(uart)
+#	define stdout MMIO_CAPABILITY(void, uart)
+#	define stdin MMIO_CAPABILITY(void uart)
+#endif
+
+#if DEVICE_EXISTS(uart1)
+#	define stderr MMIO_CAPABILITY(void, uart1)
+#elif defined(stdout)
+#	define stderr stdout
+#endif
+
+int __cheri_libcall vfprintf(FILE *stream, const char *fmt, va_list ap);
+
+static inline int fprintf(FILE *stream, const char *format, ...)
+{
+	va_list ap;
+
+	va_start(ap, format);
+	int ret = vfprintf(stream, format, ap);
+	va_end(ap);
+	return ret;
+}
+
+static inline int printf(const char *format, ...)
+{
+	va_list ap;
+
+	va_start(ap, format);
+	int ret = vfprintf(stdout, format, ap);
+	va_end(ap);
+	return ret;
+}
 
 int __cheri_libcall snprintf(char *str, size_t size, const char *format, ...);
-int __cheri_libcall vsnprintf(char *str, size_t size, const char *format, va_list ap); 
+int __cheri_libcall vsnprintf(const char *str,
+                              size_t      size,
+                              const char *format,
+                              va_list     ap);
 __END_DECLS
 
 #endif /* !__STDIO_H__ */
diff --git a/sdk/include/stdlib.h b/sdk/include/stdlib.h
index 75b6bdd..d60e4e4 100644
--- a/sdk/include/stdlib.h
+++ b/sdk/include/stdlib.h
@@ -105,6 +105,11 @@
  * or if the allocation cannot be satisfied under any circumstances (for example
  * if `size` is larger than the total heap size).
  *
+ * In both blocking and non-blocking cases, `-ENOTENOUGHSTACK` may be returned
+ * if the stack is insufficiently large to safely run the function. This means
+ * that the return value of `heap_allocate` should be checked for the validity
+ * of the tag bit *and not* nullptr.
+ *
  * Memory returned from this interface is guaranteed to be zeroed.
  */
 void *__cheri_compartment("alloc")
@@ -129,6 +134,9 @@
  * if `nmemb` * `size` is larger than the total heap size, or if `nmemb` *
  * `size` overflows).
  *
+ * Similarly to `heap_allocate`, `-ENOTENOUGHSTACK` may be returned if the
+ * stack is insufficiently large to run the function. See `heap_allocate`.
+ *
  * Memory returned from this interface is guaranteed to be zeroed.
  */
 void *__cheri_compartment("alloc")
@@ -141,8 +149,12 @@
  * Add a claim to an allocation.  The object will be counted against the quota
  * provided by the first argument until a corresponding call to `heap_free`.
  * Note that this can be used with interior pointers.
+ *
+ * This will return the size of the allocation claimed on success, 0 on error
+ * (if `heapCapability` or `pointer` is not valid, etc.), or `-ENOTENOUGHSTACK`
+ * if the stack is insufficiently large to run the function.
  */
-size_t __cheri_compartment("alloc")
+ssize_t __cheri_compartment("alloc")
   heap_claim(struct SObjStruct *heapCapability, void *pointer);
 
 /**
@@ -152,10 +164,11 @@
  * until either the next cross-compartment call or the next call to this
  * function.
  *
- * A null pointer can be used as a not-present value.  This function will
- * treat operations on null pointers as unconditionally successful.  It returns
- * `-ETIMEDOUT` if it failed to claim before the timeout expired or `EINVAL` if
- * one or more of the arguments is neither null nor a valid pointer at the end.
+ * A null pointer can be used as a not-present value.  This function will treat
+ * operations on null pointers as unconditionally successful.  It returns
+ * `-ETIMEDOUT` if it failed to claim before the timeout expired, or `EINVAL`
+ * if one or more of the arguments is neither null nor a valid pointer at the
+ * end.
  *
  * In the case of failure, neither pointer will have been claimed.
  *
@@ -169,8 +182,9 @@
 /**
  * Free a heap allocation.
  *
- * Returns 0 on success, or `-EINVAL` if `ptr` is not a valid pointer to the
- * start of a live heap allocation.
+ * Returns 0 on success, `-EINVAL` if `ptr` is not a valid pointer to the start
+ * of a live heap allocation, or `-ENOTENOUGHSTACK` if the stack size is
+ * insufficiently large to safely run the function.
  */
 int __cheri_compartment("alloc")
   heap_free(struct SObjStruct *heapCapability, void *ptr);
@@ -178,8 +192,9 @@
 /**
  * Free all allocations owned by this capability.
  *
- * Returns the number of bytes freed or `-EPERM` if this is not a valid heap
- * capability.
+ * Returns the number of bytes freed, `-EPERM` if this is not a valid heap
+ * capability, or `-ENOTENOUGHSTACK` if the stack size is insufficiently large
+ * to safely run the function.
  */
 ssize_t __cheri_compartment("alloc")
   heap_free_all(struct SObjStruct *heapCapability);
@@ -193,9 +208,10 @@
 
 /**
  * Returns the space available in the given quota. This will return -1 if
- * `heapCapability` is not valid.
+ * `heapCapability` is not valid or if the stack is insufficient to run the
+ * function.
  */
-size_t __cheri_compartment("alloc")
+ssize_t __cheri_compartment("alloc")
   heap_quota_remaining(struct SObjStruct *heapCapability);
 
 /**
@@ -238,13 +254,23 @@
 #ifndef CHERIOT_NO_AMBIENT_MALLOC
 static inline void *malloc(size_t size)
 {
-	Timeout t = {0, UnlimitedTimeout};
-	return heap_allocate(&t, MALLOC_CAPABILITY, size);
+	Timeout t   = {0, 0};
+	void   *ptr = heap_allocate(&t, MALLOC_CAPABILITY, size);
+	if (!__builtin_cheri_tag_get(ptr))
+	{
+		ptr = NULL;
+	}
+	return ptr;
 }
 static inline void *calloc(size_t nmemb, size_t size)
 {
-	Timeout t = {0, UnlimitedTimeout};
-	return heap_allocate_array(&t, MALLOC_CAPABILITY, nmemb, size);
+	Timeout t   = {0, 0};
+	void   *ptr = heap_allocate_array(&t, MALLOC_CAPABILITY, nmemb, size);
+	if (!__builtin_cheri_tag_get(ptr))
+	{
+		ptr = NULL;
+	}
+	return ptr;
 }
 static inline int free(void *ptr)
 {
diff --git a/sdk/include/switcher.h b/sdk/include/switcher.h
index ad65a42..a1088b6 100644
--- a/sdk/include/switcher.h
+++ b/sdk/include/switcher.h
@@ -1,5 +1,6 @@
 #pragma once
 #include <cdefs.h>
+#include <stddef.h>
 
 /**
  * Returns true if the trusted stack contains at least `requiredFrames` frames
@@ -49,3 +50,9 @@
  * next cross-compartment call or until they are explicitly overwritten.
  */
 __cheri_libcall void **switcher_thread_hazard_slots(void);
+
+/**
+ * Returns the lowest address that has been stored to on the stack in this
+ * compartment invocation.
+ */
+__cheri_libcall ptraddr_t stack_lowest_used_address(void);
diff --git a/sdk/include/thread.h b/sdk/include/thread.h
index 6fe495c..3c5430a 100644
--- a/sdk/include/thread.h
+++ b/sdk/include/thread.h
@@ -91,22 +91,18 @@
 #else
 	static const uint32_t CyclesPerMicrosecond = CPU_TIMER_HZ / 1'000'000;
 	__if_cxx(
-	  static_assert(CyclesPerMicrosecond > 0, "CPU_TIMER_HZ is too low");)
-	  uint64_t start = rdcycle64();
+	  static_assert(CyclesPerMicrosecond > 0, "CPU_TIMER_HZ is too low"););
+	uint64_t start = rdcycle64();
 	// Convert the microseconds to a number of cycles.  This does the multiply
 	// first so that we don't end up with zero as a result of the division.
-	uint32_t cycles = microseconds / CyclesPerMicrosecond;
-#	ifdef X
-	Debug::log("Cycles per us: {}", CyclesPerMicrosecond);
-	Debug::log("Spinning for {} cycles", cycles);
-#	endif
-	uint64_t end = start + cycles;
+	uint32_t cycles = microseconds * CyclesPerMicrosecond;
+	uint64_t end    = start + cycles;
 	uint64_t current;
 	do
 	{
 		current = rdcycle64();
 	} while (current < end);
-	return ((current - start) * CPU_TIMER_HZ) / 1'000'000;
+	return (current - start) * CyclesPerMicrosecond;
 #endif
 }
 
@@ -127,12 +123,10 @@
 	return milliseconds;
 #else
 	static const uint32_t CyclesPerMillisecond = CPU_TIMER_HZ / 1'000;
+	static const uint32_t CyclesPerTick        = CPU_TIMER_HZ / TICK_RATE_HZ;
 	__if_cxx(
-	  static_assert(
-	    CyclesPerMillisecond > 0,
-	    "CPU_TIMER_HZ is too low");) static const uint32_t CyclesPerTick =
-	  CPU_TIMER_HZ / TICK_RATE_HZ;
-	uint32_t cycles  = CPU_TIMER_HZ * milliseconds / 1'000;
+	  static_assert(CyclesPerMillisecond > 0, "CPU_TIMER_HZ is too low"););
+	uint32_t cycles  = milliseconds * CyclesPerMillisecond;
 	uint64_t start   = rdcycle64();
 	uint64_t end     = start + cycles;
 	uint64_t current = start;
@@ -148,7 +142,7 @@
 		current = rdcycle64();
 	}
 	current = rdcycle64();
-	return ((current - start) * CPU_TIMER_HZ) / 1'000;
+	return (current - start) * CyclesPerMillisecond;
 #endif
 }
 
diff --git a/sdk/lib/debug/debug.cc b/sdk/lib/debug/debug.cc
index 123bf59..0756164 100644
--- a/sdk/lib/debug/debug.cc
+++ b/sdk/lib/debug/debug.cc
@@ -257,48 +257,58 @@
 					else
 					{
 						switch (
-						  static_cast<DebugFormatArgument::Kind>(argument.kind))
+						  static_cast<DebugFormatArgumentKind>(argument.kind))
 						{
-							case DebugFormatArgument::Kind::Bool:
+							case DebugFormatArgumentKind::
+							  DebugFormatArgumentBool:
 								write(static_cast<bool>(argument.value)
 								        ? "true"
 								        : "false");
 								break;
-							case DebugFormatArgument::Kind::Character:
+							case DebugFormatArgumentKind::
+							  DebugFormatArgumentCharacter:
 								write(static_cast<char>(argument.value));
 								break;
-							case DebugFormatArgument::Kind::Pointer:
+							case DebugFormatArgumentKind::
+							  DebugFormatArgumentPointer:
 								write(reinterpret_cast<void *>(argument.value));
 								break;
-							case DebugFormatArgument::Kind::SignedNumber32:
+							case DebugFormatArgumentKind::
+							  DebugFormatArgumentSignedNumber32:
 								write(static_cast<int32_t>(argument.value));
 								break;
-							case DebugFormatArgument::Kind::UnsignedNumber32:
+							case DebugFormatArgumentKind::
+							  DebugFormatArgumentUnsignedNumber32:
 								write(static_cast<uint32_t>(argument.value));
 								break;
-							case DebugFormatArgument::Kind::SignedNumber64:
+							case DebugFormatArgumentKind::
+							  DebugFormatArgumentSignedNumber64:
 							{
 								int64_t value;
 								memcpy(&value, &argument.value, sizeof(value));
 								write(value);
 								break;
 							}
-							case DebugFormatArgument::Kind::UnsignedNumber64:
+							case DebugFormatArgumentKind::
+							  DebugFormatArgumentUnsignedNumber64:
 							{
 								uint64_t value;
 								memcpy(&value, &argument.value, sizeof(value));
 								write(value);
 								break;
 							}
-							case DebugFormatArgument::Kind::CString:
+							case DebugFormatArgumentKind::
+							  DebugFormatArgumentCString:
 								write(reinterpret_cast<const char *>(
 								  argument.value));
 								break;
-							case DebugFormatArgument::Kind::StringView:
+							case DebugFormatArgumentKind::
+							  DebugFormatArgumentStringView:
 								write(*reinterpret_cast<std::string_view *>(
 								  argument.value));
 								break;
-							case DebugFormatArgument::Kind::PermissionSet:
+							case DebugFormatArgumentKind::
+							  DebugFormatArgumentPermissionSet:
 								write(CHERI::PermissionSet::from_raw(
 								  argument.value));
 								break;
diff --git a/sdk/lib/event_group/event_group.cc b/sdk/lib/event_group/event_group.cc
index ef28ba8..04d0337 100644
--- a/sdk/lib/event_group/event_group.cc
+++ b/sdk/lib/event_group/event_group.cc
@@ -46,7 +46,7 @@
 	auto   group =
 	  static_cast<EventGroup *>(heap_allocate(timeout, heapCapability, size));
 	*outGroup = group;
-	if (!group)
+	if (!__builtin_cheri_tag_get(group))
 	{
 		return -ENOMEM;
 	}
@@ -177,19 +177,26 @@
 	return 0;
 }
 
-int eventgroup_destroy(SObjStruct *heapCapability, EventGroup *group)
+int eventgroup_destroy_force(SObjStruct *heapCapability, EventGroup *group)
 {
-	group->lock.lock();
+	group->lock.upgrade_for_destruction();
 	// Force all waiters to wake.
 	for (size_t i = 0; i < group->waiterCount; ++i)
 	{
-		auto &waiter = group->waiters[i];
-		if (waiter.bitsWanted == 0)
-		{
-			continue;
-		}
+		// Notifying is not enough, we need to ensure that waiters get
+		// appropriate bits to leave the loop.
+		auto    &waiter   = group->waiters[i];
+		uint32_t bits     = waiter.bitsWanted;
+		waiter.bitsWanted = 0;
+		waiter.bitsSeen   = bits;
 		waiter.bitsSeen.notify_one();
 	}
 	heap_free(heapCapability, group);
 	return 0;
 }
+
+int eventgroup_destroy(SObjStruct *heapCapability, EventGroup *group)
+{
+	group->lock.lock();
+	return eventgroup_destroy_force(heapCapability, group);
+}
diff --git a/sdk/lib/locks/locks.cc b/sdk/lib/locks/locks.cc
index f1081d8..5cd407c 100644
--- a/sdk/lib/locks/locks.cc
+++ b/sdk/lib/locks/locks.cc
@@ -128,7 +128,9 @@
 		 */
 		void unlock()
 		{
-			auto old = lockWord.exchange(Flag::Unlocked);
+			// Atomically empty all bits of the lockword except the
+			// destruct mode bit which we want to preserve.
+			auto old = lockWord.fetch_and(Flag::LockedInDestructMode);
 
 			// Assert that the locked is not already locked, and
 			// that the caller holds the lock. This is only
@@ -152,38 +154,28 @@
 
 		/**
 		 * Set the destruction bit in the flag lock word and wake
-		 * waiters. Assumes that the lock is held by the caller.
-		 *
-		 * Note: This does not check that the lock is owned by the
-		 * calling thread.
+		 * waiters. Callers do not need to hold the lock.
 		 */
 		void upgrade_for_destruction()
 		{
 			Debug::log("Setting {} for destruction", &lockWord);
 
-			// Assert that the caller holds the lock. This is only
-			// compiled-in with debugging enabled for locking.
-			Debug::Assert(
-			  (lockWord & 0x0000ffff) == thread_id_get(),
-			  "Calling thread {} does not hold the lock on {} (owner: {})",
-			  thread_id_get(),
-			  &lockWord,
-			  lockWord & 0x0000ffff);
-
-			// Set the destruction bit.
+			// Atomically set the destruction bit.
 			lockWord |= Flag::LockedInDestructMode;
 
 			// Wake up waiters.
 			// There should not be any 'missed wake' because any
 			// thread calling lock() concurrently will either:
-			// - successfully CAS, pass the the -ENOENT check, reach
+			// - successfully CAS and take the lock (i.e., not got
+			//   to sleep)
+			// - fail the CAS, and fail the -ENOENT check;
+			// - fail the CAS, pass the the -ENOENT check, reach
 			//   the futex `wait` before we set the
 			//   `Flag::LockedInDestructMode` bit, and be woken up
 			//   by our `notify_all`;
-			// - fail the -ENOENT check;
-			// - miss the `Flag::LockedInDestructMode` bit, pass the
-			//   -ENOENT check but fail their CAS to add the waiters
-			//   bit, retry, and fail the -ENOENT check;
+			// - miss the `Flag::LockedInDestructMode` bit, fail
+			//   the CAS, pass the -ENOENT check, fail the CAS to add
+			//   the waiters bit, retry, and fail the -ENOENT check;
 			// - or miss the `Flag::LockedInDestructMode` bit, pass
 			//   the -ENOENT check but fail their CAS in the futex
 			//   `wait` call, retry (because the futex returns 0 in
diff --git a/sdk/lib/queue/queue.cc b/sdk/lib/queue/queue.cc
index 1b0b965..5a1caf6 100644
--- a/sdk/lib/queue/queue.cc
+++ b/sdk/lib/queue/queue.cc
@@ -236,8 +236,9 @@
 		/**
 		 * The bit to use for the lock.
 		 */
-		static constexpr uint32_t LockBit    = 1U << 31;
-		static constexpr uint32_t WaitersBit = 1U << 30;
+		static constexpr uint32_t LockBit                 = 1U << 31;
+		static constexpr uint32_t WaitersBit              = 1U << 30;
+		static constexpr uint32_t LockedInDestructModeBit = 1U << 29;
 
 		// Function required to conform to the Lock concept.
 		void lock()
@@ -247,7 +248,7 @@
 
 		static constexpr uint32_t reserved_bits()
 		{
-			return LockBit | WaitersBit;
+			return LockBit | WaitersBit | LockedInDestructModeBit;
 		}
 
 		/**
@@ -259,6 +260,10 @@
 			do
 			{
 				value = lockWord.load();
+				if ((value & LockedInDestructModeBit) != 0)
+				{
+					return false;
+				}
 				if (value & LockBit)
 				{
 					// If the lock is held, set the flag that indicates that
@@ -296,6 +301,20 @@
 				lockWord.notify_all();
 			}
 		}
+
+		/**
+		 * Set the lock in destruction mode. This has the same
+		 * semantics as `flaglock_upgrade_for_destruction`.
+		 */
+		void upgrade_for_destruction()
+		{
+			// Atomically set the destruction bit.
+			lockWord |= LockedInDestructModeBit;
+			if (lockWord & WaitersBit)
+			{
+				lockWord.notify_all();
+			}
+		}
 	};
 
 	uint32_t counter_load(std::atomic<uint32_t> *counter)
@@ -338,12 +357,35 @@
 		heap_claim_fast(&t, nullptr, nullptr);
 	}
 
+	void bound_queue_buffer(struct QueueHandle handle)
+	{
+		Capability buffer   = handle.buffer;
+		Capability producer = handle.producer;
+		// Restrict the bounds using the address of the producer, which
+		// comes immediately after the queue buffer. This should be
+		// strictly equivalent to calculating the size of the queue
+		// from the number of elements and the size of elements (we
+		// assert that below).
+		buffer.bounds() = producer.address() - buffer.address();
+		Debug::Assert(
+		  [&]() -> bool {
+			  size_t bufferSize;
+			  bool   overflow = __builtin_mul_overflow(
+			      handle.queueSize, handle.elementSize, &bufferSize);
+			  bufferSize = CHERI::representable_length(bufferSize);
+			  return (!overflow) && (buffer.bounds() == bufferSize);
+		  },
+		  "Mismatch between the size of the queue as reported by `queueSize` "
+		  "and `elementSize` and its real size.");
+	}
+
 } // namespace
 
 struct QueueHandle queue_make_receive_handle(struct QueueHandle handle)
 {
 	Capability buffer   = handle.buffer;
 	Capability producer = handle.producer;
+	bound_queue_buffer(handle);
 	buffer.permissions() &= ReadOnlyCapability;
 	producer.permissions() &= ReadOnly;
 	handle.buffer   = buffer;
@@ -355,6 +397,7 @@
 {
 	Capability buffer   = handle.buffer;
 	Capability consumer = handle.consumer;
+	bound_queue_buffer(handle);
 	buffer.permissions() &= WriteOnlyCapability;
 	consumer.permissions() &= ReadOnly;
 	handle.buffer   = buffer;
@@ -362,6 +405,36 @@
 	return handle;
 }
 
+int queue_destroy(struct SObjStruct *heapCapability, struct QueueHandle *handle)
+{
+	int ret = 0;
+	// Only upgrade the locks for destruction if we know that we will be
+	// able to free the queue at the end. This will fail if passed a
+	// restricted buffer, which will happen if `queue_destroy` is called on
+	// a restricted queue.
+	if (ret = heap_can_free(heapCapability, handle->buffer); ret != 0)
+	{
+		return ret;
+	}
+
+	auto           *producer = handle->producer;
+	HighBitFlagLock producerLock{*producer};
+	producerLock.upgrade_for_destruction();
+
+	auto           *consumer = handle->consumer;
+	HighBitFlagLock consumerLock{*consumer};
+	consumerLock.upgrade_for_destruction();
+
+	// This should not fail because of the `heap_can_free` check, unless we
+	// run out of stack.
+	if (ret = heap_free(heapCapability, handle->buffer); ret != 0)
+	{
+		return ret;
+	}
+
+	return ret;
+}
+
 int queue_create(Timeout            *timeout,
                  struct SObjStruct  *heapCapability,
                  struct QueueHandle *outQueue,
@@ -390,7 +463,7 @@
 	// We need the counters to be able to run to double the queue size without
 	// hitting the high bits.  Error if this is the case.
 	//
-	// This should never be reached: a queue needs to be at least 512 MiB
+	// This should never be reached: a queue needs to be at least 256 MiB
 	// (assuming one-byte elements) to hit this limit.
 	if (((elementCount | (elementCount * 2)) &
 	     HighBitFlagLock::reserved_bits()) != 0)
@@ -416,8 +489,7 @@
 	producer.bounds() = CounterSize;
 	consumer.bounds() = CounterSize;
 	// The pointer used to free the allocation
-	*outAllocation  = buffer;
-	buffer.bounds() = bufferSize;
+	*outAllocation = buffer;
 	Debug::log("Created queue with buffer: {}", buffer);
 	// The handle
 	*outQueue = {elementSize, elementCount, buffer, producer, consumer};
diff --git a/sdk/lib/queue/queue_compartment.cc b/sdk/lib/queue/queue_compartment.cc
index e43876b..96f4845 100644
--- a/sdk/lib/queue/queue_compartment.cc
+++ b/sdk/lib/queue/queue_compartment.cc
@@ -78,6 +78,7 @@
 	receive->allocation = freeBuffer;
 	// Add a second claim on the buffer so that we can free the queue by freeing
 	// it twice, once in each endpoint.
+	// TODO we should check the return value of `heap_claim`
 	heap_claim(heapCapability, freeBuffer);
 
 	if (int claimed = heap_claim_fast(timeout, outQueueSend, outQueueReceive);
diff --git a/sdk/lib/stdio/printf.c b/sdk/lib/stdio/printf.c
deleted file mode 100644
index fc210a1..0000000
--- a/sdk/lib/stdio/printf.c
+++ /dev/null
@@ -1,558 +0,0 @@
-/*-
- * Copyright (c) 1986, 1988, 1991, 1993
- *	The Regents of the University of California.  All rights reserved.
- * (c) UNIX System Laboratories, Inc.
- * All or some portions of this file are derived from material licensed
- * to the University of California by American Telephone and Telegraph
- * Co. or Unix System Laboratories, Inc. and are reproduced herein with
- * the permission of UNIX System Laboratories, Inc.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 4. Neither the name of the University nor the names of its contributors
- *    may be used to endorse or promote products derived from this software
- *    without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- *	@(#)subr_prf.c	8.3 (Berkeley) 1/21/94
- */
-
-/* __FBSDID("$FreeBSD: stable/8/sys/kern/subr_prf.c 210305 2010-07-20 18:55:13Z
- * jkim $"); */
-
-#include <cdefs.h>
-#include <cheri-builtins.h>
-#include <compartment.h>
-#include <inttypes.h>
-#include <stdarg.h>
-#include <stdio.h>
-#include <string.h>
-
-/*
- * Definitions snarfed from various parts of the FreeBSD headers:
- */
-#define toupper(c) ((c)-0x20 * (((c) >= 'a') && ((c) <= 'z')))
-
-static char hex2ascii(uintmax_t in)
-{
-	return (in < 10) ? '0' + in : 'a' + in - 10;
-}
-
-/* Max number conversion buffer length: a u_quad_t in base 2, plus NUL byte. */
-#define MAXNBUF (sizeof(intmax_t) * CHAR_BIT + 1)
-
-struct snprintf_arg
-{
-	char * str;
-	size_t remain;
-};
-
-static void snprintf_func(int ch, void *arg)
-{
-	struct snprintf_arg *const info = arg;
-
-	if (info->remain >= 2)
-	{
-		*info->str++ = ch;
-		info->remain--;
-	}
-}
-
-/*
- * Put a NUL-terminated ASCII number (base <= 36) in a buffer in reverse
- * order; return an optional length and a pointer to the last character
- * written in the buffer (i.e., the first character of the string).
- * The buffer pointed to by `nbuf' must have length >= MAXNBUF.
- */
-// FIXME: Using `unsigned` for `num` instead of `uintmax_t` means that we are
-// going to truncate large numbers, but it avoids needing a library routine to
-// handle division.
-static char *ksprintn(char *nbuf, unsigned num, int base, int *lenp, int upper)
-{
-	char *p, c;
-
-	p  = nbuf;
-	*p = '\0';
-	do
-	{
-		c    = hex2ascii(num % base);
-		*++p = upper ? toupper(c) : c;
-	} while (num /= base);
-	if (lenp)
-		*lenp = p - nbuf;
-	return (p);
-}
-
-/*
- * Scaled down version of printf(3).
- *
- * Two additional formats:
- *
- * The format %b is supported to decode error registers.
- * Its usage is:
- *
- *	printf("reg=%b\n", regval, "<base><arg>*");
- *
- * where <base> is the output base expressed as a control character, e.g.
- * \10 gives octal; \20 gives hex.  Each arg is a sequence of characters,
- * the first of which gives the bit number to be inspected (origin 1), and
- * the next characters (up to a control character, i.e. a character <= 32),
- * give the name of the register.  Thus:
- *
- *	kvprintf("reg=%b\n", 3, "\10\2BITTWO\1BITONE\n");
- *
- * would produce output:
- *
- *	reg=3<BITTWO,BITONE>
- *
- * %D  -- Hexdump, takes pointer and separator string:
- *		("%6D", ptr, ":")   -> XX:XX:XX:XX:XX:XX
- *		("%*D", len, ptr, " " -> XX XX XX XX ...
- */
-static int kvprintf(char const *fmt,
-                    void (*func)(int, void *),
-                    void *  arg,
-                    int     radix,
-                    va_list ap)
-{
-#define PCHAR(c)                                                               \
-	{                                                                          \
-		int cc = (c);                                                          \
-		if (func)                                                              \
-			(*func)(cc, arg);                                                  \
-		else                                                                   \
-			*d++ = cc;                                                         \
-		retval++;                                                              \
-	}
-	char           nbuf[MAXNBUF];
-	char *         d;
-	const char *   p, *percent, *q;
-	unsigned char *up;
-	int            ch, n;
-	uintmax_t      num;
-	int  base, lflag, qflag, tmp, width, ladjust, sharpflag, neg, sign, dot;
-	int  cflag, hflag, jflag, tflag, zflag;
-	int  dwidth, upper;
-	char padc;
-	int  stop = 0, retval = 0;
-	char null_str[1] = {'\0'};
-
-	num = 0;
-	if (!func)
-		d = (char *)arg;
-	else
-		d = NULL;
-
-	if (fmt == NULL)
-		fmt = null_str;
-
-	if (radix < 2 || radix > 36)
-		radix = 10;
-
-	for (;;)
-	{
-		padc  = ' ';
-		width = 0;
-		while ((ch = (unsigned char)*fmt++) != '%' || stop)
-		{
-			if (ch == '\0')
-				return (retval);
-			PCHAR(ch);
-		}
-		percent   = fmt - 1;
-		qflag     = 0;
-		lflag     = 0;
-		ladjust   = 0;
-		sharpflag = 0;
-		neg       = 0;
-		sign      = 0;
-		dot       = 0;
-		dwidth    = 0;
-		upper     = 0;
-		cflag     = 0;
-		hflag     = 0;
-		jflag     = 0;
-		tflag     = 0;
-		zflag     = 0;
-	reswitch:
-		switch (ch = (unsigned char)*fmt++)
-		{
-			case '.':
-				dot = 1;
-				goto reswitch;
-			case '#':
-				sharpflag = 1;
-				goto reswitch;
-			case '+':
-				sign = 1;
-				goto reswitch;
-			case '-':
-				ladjust = 1;
-				goto reswitch;
-			case '%':
-				PCHAR(ch);
-				break;
-			case '*':
-				if (!dot)
-				{
-					width = va_arg(ap, int);
-					if (width < 0)
-					{
-						ladjust = !ladjust;
-						width   = -width;
-					}
-				}
-				else
-				{
-					dwidth = va_arg(ap, int);
-				}
-				goto reswitch;
-			case '0':
-				if (!dot)
-				{
-					padc = '0';
-					goto reswitch;
-				}
-			case '1':
-			case '2':
-			case '3':
-			case '4':
-			case '5':
-			case '6':
-			case '7':
-			case '8':
-			case '9':
-				for (n = 0;; ++fmt)
-				{
-					n  = n * 10 + ch - '0';
-					ch = *fmt;
-					if (ch < '0' || ch > '9')
-						break;
-				}
-				if (dot)
-					dwidth = n;
-				else
-					width = n;
-				goto reswitch;
-			case 'b':
-				num = (unsigned int)va_arg(ap, int);
-				p   = va_arg(ap, char *);
-				for (q = ksprintn(nbuf, num, *p++, NULL, 0); *q;)
-					PCHAR(*q--);
-
-				if (num == 0)
-					break;
-
-				for (tmp = 0; *p;)
-				{
-					n = *p++;
-					if (num & (1 << (n - 1)))
-					{
-						PCHAR(tmp ? ',' : '<');
-						for (; (n = *p) > ' '; ++p)
-							PCHAR(n);
-						tmp = 1;
-					}
-					else
-						for (; *p > ' '; ++p)
-							continue;
-				}
-				if (tmp)
-					PCHAR('>');
-				break;
-			case 'c':
-				PCHAR(va_arg(ap, int));
-				break;
-			case 'D':
-				up = va_arg(ap, unsigned char *);
-				p  = va_arg(ap, char *);
-				if (!width)
-					width = 16;
-				while (width--)
-				{
-					PCHAR(hex2ascii(*up >> 4));
-					PCHAR(hex2ascii(*up & 0x0f));
-					up++;
-					if (width)
-						for (q = p; *q; q++)
-							PCHAR(*q);
-				}
-				break;
-			case 'd':
-			case 'i':
-				base = 10;
-				sign = 1;
-				goto handle_sign;
-			case 'h':
-				if (hflag)
-				{
-					hflag = 0;
-					cflag = 1;
-				}
-				else
-					hflag = 1;
-				goto reswitch;
-			case 'j':
-				jflag = 1;
-				goto reswitch;
-			case 'l':
-				if (lflag)
-				{
-					lflag = 0;
-					qflag = 1;
-				}
-				else
-					lflag = 1;
-				goto reswitch;
-			case 'n':
-				if (jflag)
-					*(va_arg(ap, intmax_t *)) = retval;
-				else if (qflag)
-					*(va_arg(ap, long long *)) = retval;
-				else if (lflag)
-					*(va_arg(ap, long *)) = retval;
-				else if (zflag)
-					*(va_arg(ap, size_t *)) = retval;
-				else if (hflag)
-					*(va_arg(ap, short *)) = retval;
-				else if (cflag)
-					*(va_arg(ap, char *)) = retval;
-				else
-					*(va_arg(ap, int *)) = retval;
-				break;
-			case 'o':
-				base = 8;
-				goto handle_nosign;
-			case 'p':
-				base      = 16;
-				sharpflag = (width == 0);
-				sign      = 0;
-				num       = (size_t)va_arg(ap, void *);
-				goto number;
-			case 'q':
-				qflag = 1;
-				goto reswitch;
-			case 'r':
-				base = radix;
-				if (sign)
-					goto handle_sign;
-				goto handle_nosign;
-			case 's':
-				p = va_arg(ap, char *);
-				if (p == NULL)
-					p = null_str;
-				if (!dot)
-					n = strlen(p);
-				else
-					for (n = 0; n < dwidth && p[n]; n++)
-						continue;
-
-				width -= n;
-
-				if (!ladjust && width > 0)
-					while (width--)
-						PCHAR(padc);
-				while (n--)
-					PCHAR(*p++);
-				if (ladjust && width > 0)
-					while (width--)
-						PCHAR(padc);
-				break;
-			case 't':
-				tflag = 1;
-				goto reswitch;
-			case 'u':
-				base = 10;
-				goto handle_nosign;
-			case 'X':
-				upper = 1;
-			case 'x':
-				base = 16;
-				goto handle_nosign;
-			case 'y':
-				base = 16;
-				sign = 1;
-				goto handle_sign;
-			case 'z':
-				zflag = 1;
-				goto reswitch;
-			handle_nosign:
-				sign = 0;
-				if (jflag)
-					num = va_arg(ap, uintmax_t);
-				else if (qflag)
-					num = va_arg(ap, unsigned long long);
-				else if (tflag)
-					num = va_arg(ap, ptrdiff_t);
-				else if (lflag)
-					num = va_arg(ap, unsigned long);
-				else if (zflag)
-					num = va_arg(ap, size_t);
-				else if (hflag)
-					num = (unsigned short)va_arg(ap, int);
-				else if (cflag)
-					num = (unsigned char)va_arg(ap, int);
-				else
-					num = va_arg(ap, unsigned int);
-				goto number;
-			handle_sign:
-				if (jflag)
-					num = va_arg(ap, intmax_t);
-				else if (qflag)
-					num = va_arg(ap, long long);
-				else if (tflag)
-					num = va_arg(ap, ptrdiff_t);
-				else if (lflag)
-					num = va_arg(ap, long);
-				else if (zflag)
-					num = va_arg(ap, ssize_t);
-				else if (hflag)
-					num = (short)va_arg(ap, int);
-				else if (cflag)
-					num = (char)va_arg(ap, int);
-				else
-					num = va_arg(ap, int);
-			number:
-				if (sign && (intmax_t)num < 0)
-				{
-					neg = 1;
-					num = -(intmax_t)num;
-				}
-				p   = ksprintn(nbuf, num, base, &n, upper);
-				tmp = 0;
-				if (sharpflag && num != 0)
-				{
-					if (base == 8)
-						tmp++;
-					else if (base == 16)
-						tmp += 2;
-				}
-				if (neg)
-					tmp++;
-
-				if (!ladjust && padc == '0')
-					dwidth = width - tmp;
-				width -= tmp + (dwidth > n ? dwidth : n);
-				dwidth -= n;
-				if (!ladjust)
-					while (width-- > 0)
-						PCHAR(' ');
-				if (neg)
-					PCHAR('-');
-				if (sharpflag && num != 0)
-				{
-					if (base == 8)
-					{
-						PCHAR('0');
-					}
-					else if (base == 16)
-					{
-						PCHAR('0');
-						PCHAR('x');
-					}
-				}
-				while (dwidth-- > 0)
-					PCHAR('0');
-
-				while (*p)
-					PCHAR(*p--);
-
-				if (ladjust)
-					while (width-- > 0)
-						PCHAR(' ');
-
-				break;
-			default:
-				while (percent < fmt)
-					PCHAR(*percent++);
-				/*
-				 * Since we ignore an formatting argument it is no
-				 * longer safe to obey the remaining formatting
-				 * arguments as the arguments will no longer match
-				 * the format specs.
-				 */
-				stop = 1;
-				break;
-		}
-	}
-#undef PCHAR
-}
-
-/*
- * Scaled down version of vsnprintf(3).
- */
-int __cheri_libcall vsnprintf(char *str, size_t size, const char *format, va_list ap)
-{
-	struct snprintf_arg info;
-	int                 retval;
-
-	info.str    = str;
-	info.remain = size;
-	retval      = kvprintf(format, snprintf_func, &info, 10, ap);
-	if (info.remain >= 1)
-		*info.str++ = '\0';
-	return (retval);
-}
-
-#define UART_REG_QUEUE 0
-#define UART_REG_LINESTAT (5)
-#define UART_REG_STATUS_RX (0x01)
-#define UART_REG_STATUS_TX (0x20)
-
-
-[[cheri::interrupt_state(disabled)]]
-static void uart16550_txbuffer(const char *ptr)
-{
-	long               flags;
-	volatile uint32_t *uart16550 = MMIO_CAPABILITY(uint32_t, uart);
-
-	for (int i = 0; i < PRT_MAX_SIZE && ptr[i]; i++)
-	{
-		while ((uart16550[UART_REG_LINESTAT] & UART_REG_STATUS_TX) == 0) {}
-
-		uart16550[UART_REG_QUEUE] = ptr[i];
-	}
-}
-
-int __cheri_libcall printf(const char *format, ...)
-{
-	char    buf[PRT_MAX_SIZE];
-	int     rv;
-	va_list ap;
-
-	va_start(ap, format);
-	rv = vsnprintf(buf, PRT_MAX_SIZE, format, ap);
-	va_end(ap);
-	uart16550_txbuffer(buf);
-
-	return rv;
-}
-
-int __cheri_libcall snprintf(char *str, size_t size, const char *format, ...)
-{
-	int     rv;
-	va_list ap;
-
-	va_start(ap, format);
-	rv = vsnprintf(str, size, format, ap);
-	va_end(ap);
-
-	return rv;
-}
diff --git a/sdk/lib/stdio/printf.cc b/sdk/lib/stdio/printf.cc
new file mode 100644
index 0000000..8fc5f51
--- /dev/null
+++ b/sdk/lib/stdio/printf.cc
@@ -0,0 +1,552 @@
+/*-
+ * Copyright (c) 1986, 1988, 1991, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)subr_prf.c	8.3 (Berkeley) 1/21/94
+ */
+
+/* __FBSDID("$FreeBSD: stable/8/sys/kern/subr_prf.c 210305 2010-07-20 18:55:13Z
+ * jkim $"); */
+
+#include <cdefs.h>
+#include <cheri-builtins.h>
+#include <compartment.h>
+#include <cstdint>
+#include <function_wrapper.hh>
+#include <inttypes.h>
+#include <platform-uart.hh>
+#include <stdarg.h>
+#include <stdio.h>
+#include <string.h>
+
+/*
+ * Definitions snarfed from various parts of the FreeBSD headers:
+ */
+namespace
+{
+	__always_inline char toupper(char c)
+	{
+		return ((c)-0x20 * (((c) >= 'a') && ((c) <= 'z')));
+	}
+
+	static char hex2ascii(uintmax_t in)
+	{
+		return (in < 10) ? '0' + in : 'a' + in - 10;
+	}
+
+/* Max number conversion buffer length: a u_quad_t in base 2, plus NUL byte. */
+#define MAXNBUF (sizeof(intmax_t) * CHAR_BIT + 1)
+
+	/*
+	 * Put a NUL-terminated ASCII number (base <= 36) in a buffer in reverse
+	 * order; return an optional length and a pointer to the last character
+	 * written in the buffer (i.e., the first character of the string).
+	 * The buffer pointed to by `nbuf' must have length >= MAXNBUF.
+	 */
+	// FIXME: Using `unsigned` for `num` instead of `uintmax_t` means that we
+	// are going to truncate large numbers, but it avoids needing a library
+	// routine to handle division.
+	static char *
+	ksprintn(char *nbuf, unsigned num, int base, int *lenp, int upper)
+	{
+		char *p, c;
+
+		p  = nbuf;
+		*p = '\0';
+		do
+		{
+			c    = hex2ascii(num % base);
+			*++p = upper ? toupper(c) : c;
+		} while (num /= base);
+		if (lenp)
+			*lenp = p - nbuf;
+		return (p);
+	}
+
+	/*
+	 * Scaled down version of printf(3).
+	 *
+	 * Two additional formats:
+	 *
+	 * The format %b is supported to decode error registers.
+	 * Its usage is:
+	 *
+	 *	printf("reg=%b\n", regval, "<base><arg>*");
+	 *
+	 * where <base> is the output base expressed as a control character, e.g.
+	 * \10 gives octal; \20 gives hex.  Each arg is a sequence of characters,
+	 * the first of which gives the bit number to be inspected (origin 1), and
+	 * the next characters (up to a control character, i.e. a character <= 32),
+	 * give the name of the register.  Thus:
+	 *
+	 *	kvprintf("reg=%b\n", 3, "\10\2BITTWO\1BITONE\n");
+	 *
+	 * would produce output:
+	 *
+	 *	reg=3<BITTWO,BITONE>
+	 *
+	 * %D  -- Hexdump, takes pointer and separator string:
+	 *		("%6D", ptr, ":")   -> XX:XX:XX:XX:XX:XX
+	 *		("%*D", len, ptr, " " -> XX XX XX XX ...
+	 */
+	__noinline int kvprintf(char const                *fmt,
+	                        FunctionWrapper<void(int)> func,
+	                        void                      *arg,
+	                        int                        radix,
+	                        va_list                    ap)
+	{
+		char           nbuf[MAXNBUF];
+		const char    *p, *percent, *q;
+		unsigned char *up;
+		int            ch, n;
+		uintmax_t      num;
+		int  base, lflag, qflag, tmp, width, ladjust, sharpflag, neg, sign, dot;
+		int  cflag, hflag, jflag, tflag, zflag;
+		int  dwidth, upper;
+		char padc;
+		int  stop = 0, retval = 0;
+		const char *emptyString = "";
+
+		auto putchar = [&](int c) {
+			func(c);
+			retval++;
+		};
+
+		if (fmt == nullptr)
+		{
+			fmt = emptyString;
+		}
+
+		if (radix < 2 || radix > 36)
+		{
+			radix = 10;
+		}
+
+		for (;;)
+		{
+			padc  = ' ';
+			width = 0;
+			while ((ch = static_cast<unsigned char>(*fmt++)) != '%' || stop)
+			{
+				if (ch == '\0')
+					return (retval);
+				putchar(ch);
+			}
+			percent   = fmt - 1;
+			qflag     = 0;
+			lflag     = 0;
+			ladjust   = 0;
+			sharpflag = 0;
+			neg       = 0;
+			sign      = 0;
+			dot       = 0;
+			dwidth    = 0;
+			upper     = 0;
+			cflag     = 0;
+			hflag     = 0;
+			jflag     = 0;
+			tflag     = 0;
+			zflag     = 0;
+		reswitch:
+			switch (ch = static_cast<unsigned char>(*fmt++))
+			{
+				case '.':
+					dot = 1;
+					goto reswitch; // NOLINT
+				case '#':
+					sharpflag = 1;
+					goto reswitch; // NOLINT
+				case '+':
+					sign = 1;
+					goto reswitch; // NOLINT
+				case '-':
+					ladjust = 1;
+					goto reswitch; // NOLINT
+				case '%':
+					putchar(ch);
+					break;
+				case '*':
+					if (!dot)
+					{
+						width = va_arg(ap, int);
+						if (width < 0)
+						{
+							ladjust = !ladjust;
+							width   = -width;
+						}
+					}
+					else
+					{
+						dwidth = va_arg(ap, int);
+					}
+					goto reswitch; // NOLINT
+				case '0':
+					if (!dot)
+					{
+						padc = '0';
+						goto reswitch; // NOLINT
+					}
+				case '1':
+				case '2':
+				case '3':
+				case '4':
+				case '5':
+				case '6':
+				case '7':
+				case '8':
+				case '9':
+					for (n = 0;; ++fmt)
+					{
+						n  = n * 10 + ch - '0';
+						ch = *fmt;
+						if (ch < '0' || ch > '9')
+							break;
+					}
+					if (dot)
+						dwidth = n;
+					else
+						width = n;
+					goto reswitch; // NOLINT
+				case 'b':
+					num = static_cast<unsigned int>(va_arg(ap, int));
+					p   = va_arg(ap, char *);
+					for (q = ksprintn(nbuf, num, *p++, nullptr, 0); *q;)
+						putchar(*q--);
+
+					if (num == 0)
+						break;
+
+					for (tmp = 0; *p;)
+					{
+						n = *p++;
+						if (num & (1 << (n - 1)))
+						{
+							putchar(tmp ? ',' : '<');
+							for (; (n = *p) > ' '; ++p)
+								putchar(n);
+							tmp = 1;
+						}
+						else
+							for (; *p > ' '; ++p)
+								continue;
+					}
+					if (tmp)
+						putchar('>');
+					break;
+				case 'c':
+					putchar(va_arg(ap, int));
+					break;
+				case 'D':
+					up = va_arg(ap, unsigned char *);
+					p  = va_arg(ap, char *);
+					if (!width)
+						width = 16;
+					while (width--)
+					{
+						putchar(hex2ascii(*up >> 4));
+						putchar(hex2ascii(*up & 0x0f));
+						up++;
+						if (width)
+							for (q = p; *q; q++)
+								putchar(*q);
+					}
+					break;
+				case 'd':
+				case 'i':
+					base = 10;
+					sign = 1;
+					goto handle_sign; // NOLINT
+				case 'h':
+					if (hflag)
+					{
+						hflag = 0;
+						cflag = 1;
+					}
+					else
+						hflag = 1;
+					goto reswitch; // NOLINT
+				case 'j':
+					jflag = 1;
+					goto reswitch; // NOLINT
+				case 'l':
+					if (lflag)
+					{
+						lflag = 0;
+						qflag = 1;
+					}
+					else
+						lflag = 1;
+					goto reswitch; // NOLINT
+				case 'n':
+					if (jflag)
+						*(va_arg(ap, intmax_t *)) = retval;
+					else if (qflag)
+						*(va_arg(ap, long long *)) = retval;
+					else if (lflag)
+						*(va_arg(ap, long *)) = retval;
+					else if (zflag)
+						*(va_arg(ap, size_t *)) = retval;
+					else if (hflag)
+						*(va_arg(ap, short *)) = static_cast<short>(retval);
+					else if (cflag)
+						*(va_arg(ap, char *)) = retval;
+					else
+						*(va_arg(ap, int *)) = retval;
+					break;
+				case 'o':
+					base = 8;
+					goto handle_nosign; // NOLINT
+				case 'p':
+					base      = 16;
+					sharpflag = (width == 0);
+					sign      = 0;
+					num       = static_cast<size_t>(
+                      reinterpret_cast<uintptr_t>(va_arg(ap, void *)));
+					goto number; // NOLINT
+				case 'q':
+					qflag = 1;
+					goto reswitch; // NOLINT
+				case 'r':
+					base = radix;
+					if (sign)
+						goto handle_sign; // NOLINT
+					goto handle_nosign;   // NOLINT
+				case 's':
+					p = va_arg(ap, char *);
+					if (p == nullptr)
+					{
+						p = emptyString;
+					}
+					if (!dot)
+					{
+						n = strlen(p);
+					}
+					else
+					{
+						for (n = 0; n < dwidth && p[n]; n++)
+						{
+							continue;
+						}
+					}
+
+					width -= n;
+
+					if (!ladjust && width > 0)
+					{
+						while (width--)
+						{
+							putchar(padc);
+						}
+					}
+					while (n--)
+					{
+						putchar(*p++);
+					}
+					if (ladjust && width > 0)
+					{
+						while (width--)
+						{
+							putchar(padc);
+						}
+					}
+					break;
+				case 't':
+					tflag = 1;
+					goto reswitch; // NOLINT
+				case 'u':
+					base = 10;
+					goto handle_nosign; // NOLINT
+				case 'X':
+					upper = 1;
+				case 'x':
+					base = 16;
+					goto handle_nosign; // NOLINT
+				case 'y':
+					base = 16;
+					sign = 1;
+					goto handle_sign; // NOLINT
+				case 'z':
+					zflag = 1;
+					goto reswitch; // NOLINT
+				handle_nosign:
+					sign = 0;
+					if (jflag)
+						num = va_arg(ap, uintmax_t);
+					else if (qflag)
+						num = va_arg(ap, unsigned long long);
+					else if (tflag)
+						num = va_arg(ap, ptrdiff_t);
+					else if (lflag)
+						num = va_arg(ap, unsigned long);
+					else if (zflag)
+						num = va_arg(ap, size_t);
+					else if (hflag)
+						num = static_cast<unsigned short>(va_arg(ap, int));
+					else if (cflag)
+						num = static_cast<unsigned char>(va_arg(ap, int));
+					else
+						num = va_arg(ap, unsigned int);
+					goto number; // NOLINT
+				handle_sign:
+					if (jflag)
+						num = va_arg(ap, intmax_t);
+					else if (qflag)
+						num = va_arg(ap, long long);
+					else if (tflag)
+						num = va_arg(ap, ptrdiff_t);
+					else if (lflag)
+						num = va_arg(ap, long);
+					else if (zflag)
+						num = va_arg(ap, ssize_t);
+					else if (hflag)
+						num = static_cast<short>(va_arg(ap, int));
+					else if (cflag)
+						num = static_cast<char>(va_arg(ap, int));
+					else
+						num = va_arg(ap, int);
+				number:
+					if (sign && static_cast<intmax_t>(num) < 0)
+					{
+						neg = 1;
+						num = -static_cast<intmax_t>(num);
+					}
+					p   = ksprintn(nbuf, num, base, &n, upper);
+					tmp = 0;
+					if (sharpflag && num != 0)
+					{
+						if (base == 8)
+							tmp++;
+						else if (base == 16)
+							tmp += 2;
+					}
+					if (neg)
+						tmp++;
+
+					if (!ladjust && padc == '0')
+						dwidth = width - tmp;
+					width -= tmp + (dwidth > n ? dwidth : n);
+					dwidth -= n;
+					if (!ladjust)
+						while (width-- > 0)
+							putchar(' ');
+					if (neg)
+						putchar('-');
+					if (sharpflag && num != 0)
+					{
+						if (base == 8)
+						{
+							putchar('0');
+						}
+						else if (base == 16)
+						{
+							putchar('0');
+							putchar('x');
+						}
+					}
+					while (dwidth-- > 0)
+						putchar('0');
+
+					while (*p)
+						putchar(*p--);
+
+					if (ladjust)
+						while (width-- > 0)
+							putchar(' ');
+
+					break;
+				default:
+					while (percent < fmt)
+						putchar(*percent++);
+					/*
+					 * Since we ignore an formatting argument it is no
+					 * longer safe to obey the remaining formatting
+					 * arguments as the arguments will no longer match
+					 * the format specs.
+					 */
+					stop = 1;
+					break;
+			}
+		}
+	}
+
+} // namespace
+
+/*
+ * Scaled down version of vsnprintf(3).
+ */
+int __cheri_libcall
+vsnprintf(char *str, // NOLINT (clang-tidy spuriously thinks this should be
+                     // const, even though it's being written to)
+          size_t      size,
+          const char *format,
+          va_list     ap)
+{
+	struct Buffer
+	{
+		char  *str;
+		size_t remain;
+	} info        = {str, size};
+	auto callback = [&](int ch) {
+		if (info.remain >= 2)
+		{
+			*info.str++ = ch;
+			info.remain--;
+		}
+	};
+	int retval = kvprintf(format, callback, &info, 10, ap);
+	if (info.remain >= 1)
+		*info.str++ = '\0';
+	return (retval);
+}
+
+[[cheri::interrupt_state(disabled)]] int __cheri_libcall
+vfprintf(FILE *stream, const char *fmt, va_list ap)
+{
+	return kvprintf(
+	  fmt,
+	  [=](int ch) { static_cast<volatile Uart *>(stream)->blocking_write(ch); },
+	  nullptr,
+	  10,
+	  ap);
+}
+
+int __cheri_libcall snprintf(char *str, size_t size, const char *format, ...)
+{
+	int     rv;
+	va_list ap;
+
+	va_start(ap, format);
+	rv = vsnprintf(str, size, format, ap);
+	va_end(ap);
+
+	return rv;
+}
diff --git a/sdk/lib/stdio/xmake.lua b/sdk/lib/stdio/xmake.lua
index 01e39cc..2596268 100644
--- a/sdk/lib/stdio/xmake.lua
+++ b/sdk/lib/stdio/xmake.lua
@@ -1,4 +1,4 @@
 library("stdio")
   add_deps("string")
   set_default(false)
-  add_files("printf.c")
+  add_files("printf.cc")
diff --git a/sdk/third_party/freertos-plus-tcp b/sdk/third_party/freertos-plus-tcp
deleted file mode 160000
index d70a21c..0000000
--- a/sdk/third_party/freertos-plus-tcp
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit d70a21ce03569118e030b348844477b92f5fd227
diff --git a/sdk/xmake.lua b/sdk/xmake.lua
index fb74db4..96b03cd 100644
--- a/sdk/xmake.lua
+++ b/sdk/xmake.lua
@@ -39,6 +39,18 @@
 debugOption("allocator")
 debugOption("token_library")
 
+function stackCheckOption(name)
+	option("stack-usage-check-" .. name)
+		set_default(false)
+		set_description("Enable dynamic stack usage checks in " .. name .. ". Do not enable this in debug builds!")
+		set_showmenu(true)
+		set_category("Debugging")
+	option_end()
+end
+
+stackCheckOption("allocator")
+stackCheckOption("scheduler")
+
 -- Force -Oz irrespective of build config.  At -O0, we blow out our stack and
 -- require much stronger alignment.
 set_optimize("Oz")
@@ -199,7 +211,7 @@
 -- having an allocator (or into providing a different allocator for a
 -- particular application)
 target("cheriot.allocator")
-	add_rules("cheriot.privileged-compartment", "cheriot.component-debug")
+	add_rules("cheriot.privileged-compartment", "cheriot.component-debug", "cheriot.component-stack-checks")
 	add_files(path.join(coredir, "allocator/main.cc"))
 	add_deps("locks")
 	add_deps("compartment_helpers")
@@ -715,6 +727,13 @@
 		target:add('defines', "DEBUG_" .. name:upper() .. "=" .. tostring(get_config("debug-"..name)))
 	end)
 
+-- Rule for conditionally enabling stack checks for a component.
+rule("cheriot.component-stack-checks")
+	after_load(function (target)
+		local name = target:get("cheriot.debug-name") or target:name()
+		target:add('options', "stack-usage-check-" .. name)
+		target:add('defines', "CHERIOT_STACK_CHECKS_" .. name:upper() .. "=" .. tostring(get_config("stack-usage-check-"..name)))
+	end)
 
 -- Build the loader.  The firmware rule will set the flags required for
 -- this to create threads.
diff --git a/tests/allocator-test.cc b/tests/allocator-test.cc
index dcc7191..c206f35 100644
--- a/tests/allocator-test.cc
+++ b/tests/allocator-test.cc
@@ -5,6 +5,7 @@
 #define TEST_NAME "Allocator"
 
 #include "tests.hh"
+#include <cheri.hh>
 #include <cheriot-atomic.hh>
 #include <cstdlib>
 #include <debug.hh>
@@ -72,7 +73,7 @@
 				Timeout t{AllocTimeout};
 				allocation = heap_allocate(&t, MALLOC_CAPABILITY, AllocSize);
 				TEST(
-				  allocation != nullptr,
+				  __builtin_cheri_tag_get(allocation),
 				  "Cannot make allocations anymore. Either the revoker is not "
 				  "working or it's too slow");
 			}
@@ -84,7 +85,7 @@
 			for (auto allocation : allocations)
 			{
 				TEST(
-				  __builtin_cheri_tag_get(allocation) == 0,
+				  !__builtin_cheri_tag_get(allocation),
 				  "tag for freed memory {} from allocation {} should be clear",
 				  allocation);
 			}
@@ -140,6 +141,8 @@
 			Timeout t{0};
 			allocation =
 			  heap_allocate(&noWait, MALLOC_CAPABILITY, BigAllocSize);
+			// here test for nullptr (as opposed to the valid tag
+			// bit) because we specifically want to check for OOM
 			if (allocation == nullptr)
 			{
 				memoryExhausted = true;
@@ -148,11 +151,13 @@
 		}
 		TEST(memoryExhausted, "Failed to exhaust memory");
 		debug_log("Trying a non-blocking allocation");
+		// nullptr check because we explicitly want to check for OOM
 		TEST(heap_allocate(&noWait, MALLOC_CAPABILITY, BigAllocSize) == nullptr,
 		     "Non-blocking heap allocation did not return failure with memory "
 		     "exhausted");
 		debug_log("Trying a huge allocation");
 		Timeout forever{UnlimitedTimeout};
+		// nullptr check because we explicitly want to check for OOM
 		TEST(heap_allocate(&forever, MALLOC_CAPABILITY, 1024 * 1024 * 1024) ==
 		       nullptr,
 		     "Non-blocking heap allocation did not return failure on huge "
@@ -165,7 +170,7 @@
 		debug_log("Entering blocking malloc");
 		Timeout t{AllocTimeout};
 		void   *ptr = heap_allocate(&t, MALLOC_CAPABILITY, BigAllocSize);
-		TEST(ptr != nullptr,
+		TEST(__builtin_cheri_tag_get(ptr),
 		     "Failed to make progress on blocking allocation, allocation "
 		     "returned {}",
 		     ptr);
@@ -190,13 +195,12 @@
 		auto                  t    = Timeout(0); /* don't sleep */
 
 		auto doAlloc = [&](size_t sz) {
-			auto p = heap_allocate(&t, MALLOC_CAPABILITY, sz);
+			CHERI::Capability p{heap_allocate(&t, MALLOC_CAPABILITY, sz)};
 
-			if (p != nullptr)
+			if (p.is_valid())
 			{
-				CHERI::Capability pwrap{p};
 				// dlmalloc can give you one granule more.
-				TEST(pwrap.length() == sz || pwrap.length() == sz + 8,
+				TEST(p.length() == sz || p.length() == sz + 8,
 				     "Bad return length");
 				memset(p, 0xCA, sz);
 				allocations.push_back(p);
@@ -257,20 +261,20 @@
 	void test_claims()
 	{
 		debug_log("Beginning tests on claims");
-		size_t quotaLeft = heap_quota_remaining(MALLOC_CAPABILITY);
+		auto quotaLeft = heap_quota_remaining(MALLOC_CAPABILITY);
 		TEST(quotaLeft == MALLOC_QUOTA,
 		     "After claim and free from {}-byte quota, {} bytes left before "
 		     "running claims tests",
 		     MALLOC_QUOTA,
 		     quotaLeft);
 		size_t allocSize       = 128;
-		size_t mallocQuotaLeft = heap_quota_remaining(MALLOC_CAPABILITY);
+		auto   mallocQuotaLeft = heap_quota_remaining(MALLOC_CAPABILITY);
 		CHERI::Capability alloc{
 		  heap_allocate(&noWait, MALLOC_CAPABILITY, allocSize)};
 		TEST(alloc.is_valid(), "Allocation failed");
 		int  claimCount = 0;
 		auto claim      = [&]() {
-            size_t claimSize = heap_claim(SECOND_HEAP, alloc);
+            ssize_t claimSize = heap_claim(SECOND_HEAP, alloc);
             claimCount++;
             TEST(claimSize == allocSize,
 			          "{}-byte allocation claimed as {} bytes (claim number {})",
@@ -289,7 +293,7 @@
 		claim();
 		quotaLeft = heap_quota_remaining(SECOND_HEAP);
 		claim();
-		size_t quotaLeftAfterSecondClaim = heap_quota_remaining(SECOND_HEAP);
+		auto quotaLeftAfterSecondClaim = heap_quota_remaining(SECOND_HEAP);
 		TEST(quotaLeft == quotaLeftAfterSecondClaim,
 		     "Claiming twice reduced quota from {} to {}",
 		     quotaLeft,
@@ -297,7 +301,7 @@
 		debug_log("Freeing object on malloc capability: {}", alloc);
 		ret = heap_free(MALLOC_CAPABILITY, alloc);
 		TEST(ret == 0, "Failed to free claimed object, return: {}", ret);
-		size_t mallocQuota2 = heap_quota_remaining(MALLOC_CAPABILITY);
+		auto mallocQuota2 = heap_quota_remaining(MALLOC_CAPABILITY);
 		TEST(mallocQuotaLeft == mallocQuota2,
 		     "Freeing claimed object did not restore quota to {}, quota is {}",
 		     mallocQuotaLeft,
@@ -328,9 +332,10 @@
 		for (size_t i = 16; i < 256; i <<= 1)
 		{
 			allocated += i;
-			TEST(heap_allocate(&noWait, SECOND_HEAP, i) != nullptr,
-			     "Allocating {} bytes failed",
-			     i);
+			TEST(
+			  __builtin_cheri_tag_get(heap_allocate(&noWait, SECOND_HEAP, i)),
+			  "Allocating {} bytes failed",
+			  i);
 		}
 		debug_log("Quota left after allocating {} bytes: {}",
 		          allocated,
@@ -354,8 +359,10 @@
 		debug_log("Before allocating, quota left: {}",
 		          heap_quota_remaining(SECOND_HEAP));
 		Timeout longTimeout{1000};
-		void   *ptr  = heap_allocate(&longTimeout, SECOND_HEAP, 16);
-		void   *ptr2 = heap_allocate(&longTimeout, SECOND_HEAP, 16);
+		void   *ptr = heap_allocate(&longTimeout, SECOND_HEAP, 16);
+		TEST(__builtin_cheri_tag_get(ptr), "Failed to allocate 16 bytes");
+		void *ptr2 = heap_allocate(&longTimeout, SECOND_HEAP, 16);
+		TEST(__builtin_cheri_tag_get(ptr2), "Failed to allocate 16 bytes");
 		debug_log("After allocating, quota left: {}",
 		          heap_quota_remaining(SECOND_HEAP));
 		static cheriot::atomic<int> state = 0;
@@ -404,7 +411,8 @@
 		// The next test requires all memory allocated from the malloc
 		// capability to be freed before it starts.
 		int sleeps = 0;
-		while (heap_quota_remaining(MALLOC_CAPABILITY) < MALLOC_QUOTA)
+		while (heap_quota_remaining(MALLOC_CAPABILITY) < MALLOC_QUOTA &&
+		       heap_quota_remaining(MALLOC_CAPABILITY) > 0)
 		{
 			Timeout t{1};
 			thread_sleep(&t);
@@ -518,7 +526,7 @@
 	Timeout t{5};
 	test_free_all();
 	void *ptr = heap_allocate(&t, STATIC_SEALED_VALUE(secondHeap), 32);
-	TEST(ptr, "Failed to allocate 32 bytes");
+	TEST(__builtin_cheri_tag_get(ptr), "Failed to allocate 32 bytes");
 	TEST(heap_address_is_valid(ptr) == true,
 	     "Heap object incorrectly reported as not heap address");
 	int ret = heap_free(MALLOC_CAPABILITY, ptr);
@@ -531,7 +539,7 @@
 	TEST(ret == 0,
 	     "Heap free with the correct capability returned failed with {}.",
 	     ret);
-	size_t quotaLeft = heap_quota_remaining(STATIC_SEALED_VALUE(secondHeap));
+	auto quotaLeft = heap_quota_remaining(STATIC_SEALED_VALUE(secondHeap));
 	TEST(quotaLeft == 1024,
 	     "After alloc and free from 1024-byte quota, {} bytes left",
 	     quotaLeft);
@@ -542,7 +550,20 @@
 	TEST(heap_address_is_valid(&noWait) == false,
 	     "Global object incorrectly reported as heap address");
 
+	t = 5;
+	Capability array{heap_allocate_array(&t, MALLOC_CAPABILITY, 0x80000004, 2)};
+	TEST(
+	  !array.is_valid(), "Allocating too large an array succeeded: {}", array);
+	array = heap_allocate_array(&t, MALLOC_CAPABILITY, 16, 2);
+	TEST(array.is_valid(), "Allocating array failed: {}", array);
+	TEST(array.length() == 32,
+	     "Allocating array returned incorrect length: {}",
+	     array);
+	ret = heap_free(MALLOC_CAPABILITY, array);
+	TEST(ret == 0, "Freeing array failed: {}", ret);
+
 	test_blocking_allocator();
+	heap_quarantine_empty();
 	test_revoke();
 	test_fuzz();
 	allocations.clear();
diff --git a/tests/debug-test.c b/tests/debug-test.c
new file mode 100644
index 0000000..03c008c
--- /dev/null
+++ b/tests/debug-test.c
@@ -0,0 +1,25 @@
+#include <compartment.h>
+#include <debug.h>
+
+__cheri_compartment("debug_test") void test_debug_c()
+{
+	unsigned char x = 'c';
+	_Bool         t = true;
+	CHERIOT_DEBUG_LOG(
+	  "Debug messages",
+	  "Testing C debug log: 42:{}, true:{}, hello world:{}, "
+	  "'c':{}, &x:{}, NULL:{}, short 3:{}, unsigned short 0xf:{}",
+	  42,
+	  t,
+	  "hello world",
+	  (char)'c',
+	  &x,
+	  NULL,
+	  (short)3,
+	  (unsigned short)0xf);
+	// Just test that these compile:
+	CHERIOT_INVARIANT(true, "Testing C++ invariant failure: 42:{}", 42);
+	CHERIOT_INVARIANT(true, "Testing C++ invariant failure");
+	CHERIOT_INVARIANT(
+	  true, "Testing C++ invariant failure: 42:{}", 42, 1, 3, 4, "oops");
+}
diff --git a/tests/debug-test.cc b/tests/debug-test.cc
new file mode 100644
index 0000000..e44bb65
--- /dev/null
+++ b/tests/debug-test.cc
@@ -0,0 +1,22 @@
+#include "tests.hh"
+#include <compartment.h>
+#include <debug.h>
+
+void test_debug_cxx()
+{
+	unsigned char x = 'c';
+	CHERIOT_DEBUG_LOG("Debug messages",
+	                  "Testing C++ debug log: 42:{}, true:{}, hello world:{}, "
+	                  "'c':{}, &x:{}, nullptr:{}",
+	                  42,
+	                  true,
+	                  "hello world",
+	                  'c',
+	                  &x,
+	                  nullptr);
+	// Just test that these compile:
+	CHERIOT_INVARIANT(true, "Testing C++ invariant failure: 42:{}", 42);
+	CHERIOT_INVARIANT(true, "Testing C++ invariant failure");
+	CHERIOT_INVARIANT(
+	  true, "Testing C++ invariant failure: 42:{}", 42, 1, 3, 4, "oops");
+}
diff --git a/tests/locks-test.cc b/tests/locks-test.cc
index 8401b0d..ffd8da2 100644
--- a/tests/locks-test.cc
+++ b/tests/locks-test.cc
@@ -23,14 +23,12 @@
 	cheriot::atomic<int>  counter;
 
 	/**
-	 * Test that a lock meets the minimum requirements: it actually provides
-	 * mutual exclusion.
+	 * Test that a lock actually provides mutual exclusion.
 	 */
 	template<typename Lock>
 	void test_lock(Lock &lock)
 	{
 		modified = false;
-		debug_log("Acquiring lock in {}", __PRETTY_FUNCTION__);
 		{
 			LockGuard g{lock};
 			async([&]() {
@@ -83,7 +81,7 @@
 			// making progress if this test fails.
 			// The generous timer makes sure that we reach the
 			// modified == true assert before the timeout.
-			Timeout t2{20};
+			Timeout t2{25};
 			TEST(lock.try_lock(&t2) == false,
 			     "Lock acquisition should not succeed!");
 
@@ -99,7 +97,7 @@
 		lock.upgrade_for_destruction();
 
 		// Give the waiter a chance to wake up
-		sleep(1);
+		sleep(5);
 
 		// Check that the destruction mode woke up the waiters
 		TEST(modified == true, "Destruction mode did not wake up waiters!");
@@ -113,6 +111,39 @@
 	}
 
 	/**
+	 * Test that unlocking a flag lock works: once a lock is released, it
+	 * can be re-acquired. This will fail if the unlock function does not
+	 * properly clear bits.
+	 */
+	void test_flaglock_unlock()
+	{
+		// Check for the case where a lock was released with the
+		// waiters bit set. Do not check the simple case without
+		// waiters as this is covered by other tests (at least
+		// `test_lock`).
+		Timeout t{5};
+		TEST(flagLock.try_lock(&t), "Failed to acquire uncontended lock");
+		counter = 0;
+		async([&]() {
+			Timeout t2{1};
+			TEST(flagLock.try_lock(&t2) == false,
+			     "Lock acquisition should not succeed!");
+			counter++;
+		});
+		do
+		{
+			debug_log("Other thread not finished, yielding");
+			sleep(1);
+		} while (counter.load() == 0);
+		flagLock.unlock();
+
+		TEST(flagLock.try_lock(&t),
+		     "Failed to acquire uncontended lock after unlock when the waiters "
+		     "bit was previously set");
+		flagLock.unlock();
+	}
+
+	/**
 	 * Test that a lock with the destruction bit set cannot be acquired
 	 * anymore.
 	 *
@@ -123,6 +154,36 @@
 		static FlagLockState flagLockState;
 		static FlagLockState priorityFlagLockState;
 
+		// Upgrade the lock to destruction mode. No need to acquire the
+		// lock for that.
+		flaglock_upgrade_for_destruction(&flagLockState);
+
+		// Check that we now fail to grab the lock with the right error
+		Timeout t{5};
+		TEST(flaglock_trylock(&t, &flagLockState) == -ENOENT,
+		     "Acquiring the lock did not fail with -ENOENT although it is in "
+		     "destruction mode");
+
+		// Now, do the same tests with the priority inheriting flag lock
+		flaglock_upgrade_for_destruction(&priorityFlagLockState);
+
+		TEST(flaglock_priority_inheriting_trylock(&t, &priorityFlagLockState) ==
+		       -ENOENT,
+		     "Acquiring the lock did not fail with -ENOENT although it is in "
+		     "destruction mode");
+	}
+
+	/**
+	 * Test that the destruction bit is preserved when unlocking.
+	 *
+	 * Note: here, plug at the C API to be able to check C error codes.
+	 */
+	void test_destruct_flag_lock_unlock()
+	{
+		// Only test without priority inheriting for code size reasons,
+		// but both should behave identically
+		static FlagLockState flagLockState;
+
 		Timeout t{5};
 		int     ret = flaglock_trylock(&t, &flagLockState);
 		TEST(ret == 0, "Flag lock trylock failed with error {}", ret);
@@ -130,23 +191,13 @@
 		// Upgrade the lock to destruction mode
 		flaglock_upgrade_for_destruction(&flagLockState);
 
-		// Check that we now fail to grab the lock with the right error
+		// Now unlock the lock
+		flaglock_unlock(&flagLockState);
+
+		// Check that the destruction bit is still set (by trying to
+		// grab the lock again - it should fail with -ENOENT)
 		TEST(flaglock_trylock(&t, &flagLockState) == -ENOENT,
-		     "Acquiring the lock did not fail with -ENOENT although it is in "
-		     "destruction mode");
-
-		// Now, do the same tests with the priority inheriting flag lock
-		ret = flaglock_priority_inheriting_trylock(&t, &priorityFlagLockState);
-		TEST(ret == 0,
-		     "Priority inheriting flag lock trylock failed with error {}",
-		     ret);
-
-		flaglock_upgrade_for_destruction(&priorityFlagLockState);
-
-		TEST(flaglock_priority_inheriting_trylock(&t, &priorityFlagLockState) ==
-		       -ENOENT,
-		     "Acquiring the lock did not fail with -ENOENT although it is in "
-		     "destruction mode");
+		     "Unlocking unsets the destruction bit of flag lock");
 	}
 
 	void test_recursive_mutex()
@@ -211,6 +262,7 @@
 	 */
 	void test_ticket_lock_ordering()
 	{
+		counter = 0;
 		debug_log("Starting ticket-lock ordering tests");
 		{
 			LockGuard g{ticketLock};
@@ -311,11 +363,13 @@
 	test_lock(flagLock);
 	test_lock(flagLockPriorityInherited);
 	test_lock(ticketLock);
+	test_flaglock_unlock();
 	test_trylock(flagLock);
 	test_trylock(flagLockPriorityInherited);
 	test_destruct_lock_wake_up(flagLock);
 	test_destruct_lock_wake_up(flagLockPriorityInherited);
 	test_destruct_flag_lock_acquire();
+	test_destruct_flag_lock_unlock();
 	test_ticket_lock_ordering();
 	test_ticket_lock_overflow();
 	test_recursive_mutex();
diff --git a/tests/queue-test.cc b/tests/queue-test.cc
index f6e3aa6..5239f19 100644
--- a/tests/queue-test.cc
+++ b/tests/queue-test.cc
@@ -6,6 +6,7 @@
 #include <cstdlib>
 #define TEST_NAME "Queue"
 #include "tests.hh"
+#include <FreeRTOS-Compat/queue.h>
 #include <debug.hh>
 #include <errno.h>
 #include <queue.h>
@@ -96,14 +97,14 @@
 	checkSpace(1);
 	queue_receive(&timeout, &queue, bytes);
 	checkSpace(0);
-	rv = heap_free(MALLOC_CAPABILITY, queueMemory);
+	rv = queue_destroy(MALLOC_CAPABILITY, &queue);
 	TEST(rv == 0, "Queue deletion failed with {}", rv);
 	debug_log("All queue library tests successful");
 }
 
 void test_queue_sealed()
 {
-	size_t  heapSpace = heap_quota_remaining(MALLOC_CAPABILITY);
+	auto    heapSpace = heap_quota_remaining(MALLOC_CAPABILITY);
 	Timeout t{1};
 	SObj    receiveHandle;
 	SObj    sendHandle;
@@ -169,9 +170,25 @@
 	debug_log("All queue compartment tests successful");
 }
 
+void test_queue_freertos()
+{
+	debug_log("Testing FreeRTOS queues");
+	auto quotaBegin    = heap_quota_remaining(MALLOC_CAPABILITY);
+	auto freertosQueue = xQueueCreate(10, sizeof(int));
+	vQueueDelete(freertosQueue);
+	auto quotaEnd = heap_quota_remaining(MALLOC_CAPABILITY);
+	TEST(
+	  quotaBegin == quotaEnd,
+	  "The FreeRTOS queue wrapper leaks memory: quota before is {}, after {}",
+	  quotaBegin,
+	  quotaEnd);
+	debug_log("All FreeRTOS queue tests successful");
+}
+
 void test_queue()
 {
 	test_queue_unsealed();
 	test_queue_sealed();
+	test_queue_freertos();
 	debug_log("All queue tests successful");
 }
diff --git a/tests/stack-test.cc b/tests/stack-test.cc
index b51ea6b..71ebd27 100644
--- a/tests/stack-test.cc
+++ b/tests/stack-test.cc
@@ -72,8 +72,45 @@
 		debug_log("Expected to invoke the handler? {}", handlerExpected);
 		set_expected_behaviour(&threadStackTestFailed, handlerExpected);
 	}
+
+	__attribute__((used)) extern "C" int test_small_stack()
+	{
+		return test_stack_requirement();
+	}
 } // namespace
 
+__attribute__((used)) extern "C" int test_with_small_stack(size_t stackSize);
+
+asm(".section .text\n"
+    ".global test_with_small_stack\n"
+    "test_with_small_stack:\n"
+    // Preserve the old stack
+    "  cmove ct0, csp\n"
+    // Add space to the requested size for the spill slots and the size of the
+    // stack that `test_small_stack` will use, plus the four capabilities that
+    // the switcher will spill for us.  We use 16 bytes to store the stack
+    // pointer and return address, `test_small_stack` uses the same amount: it
+    // needs to store the return address, and the ABI requires that stacks are
+    // 16-byte aligned and so it uses a full 16 bytes.
+    "  add a0, a0, 64\n"
+    // Get the base of the stack
+    "  cgetbase a2, csp\n"
+    // Move the stack pointer to the current base
+    "  csetaddr csp, csp, a2\n"
+    // Truncate the stack
+    "  csetbounds csp, csp, a0\n"
+    // Move to the end of the stack, minus the spill-slot size
+    "  cincoffset csp, csp, a0\n"
+    "  cincoffset csp, csp, -16\n"
+    "  csc  ct0, 0(csp)\n"
+    "  csc  cra, 8(csp)\n"
+    // Call the test function
+    "  cjal test_small_stack\n"
+    // Restore
+    "  clc  cra, 8(csp)\n"
+    "  clc  csp, 0(csp)\n"
+    "  cjr cra\n");
+
 // Defeat the compiler optimisation that may turn our first call to this into a
 // call. If the compiler does this then we will fail on an even number of
 // cross-compartment calls not an odd number.
@@ -90,6 +127,18 @@
  */
 void test_stack()
 {
+	int ret = test_with_small_stack(128);
+	TEST(ret == 0,
+	     "test_with_small_stack failed, returned {} with 128-byte stack",
+	     ret);
+	ret = test_with_small_stack(144);
+	TEST(ret == 0,
+	     "test_with_small_stack failed, returned {} with 144-byte stack",
+	     ret);
+	ret = test_with_small_stack(112);
+	TEST(ret == -ENOTENOUGHSTACK,
+	     "test_with_small_stack failed, returned {} with 112-byte stack",
+	     ret);
 	__cheri_callback void (*callback)() = cross_compartment_call;
 
 	crossCompartmentCall = test_trusted_stack_exhaustion;
@@ -127,5 +176,6 @@
 
 	debug_log("invalid stack on cross compartment call");
 	expect_handler(false);
+
 	test_stack_invalid_on_call(callback);
 }
diff --git a/tests/stack_integrity_thread.cc b/tests/stack_integrity_thread.cc
index dad3f42..7a629c5 100644
--- a/tests/stack_integrity_thread.cc
+++ b/tests/stack_integrity_thread.cc
@@ -139,3 +139,8 @@
 {
 	self_recursion(fn);
 }
+
+int test_stack_requirement()
+{
+	return 0;
+}
diff --git a/tests/stack_tests.h b/tests/stack_tests.h
index bfbe74b..fbc757b 100644
--- a/tests/stack_tests.h
+++ b/tests/stack_tests.h
@@ -52,3 +52,6 @@
 
 	return false;
 }
+
+__cheri_compartment("stack_integrity_thread")
+  __cheriot_minimum_stack(128) int test_stack_requirement();
diff --git a/tests/stdio-test.cc b/tests/stdio-test.cc
new file mode 100644
index 0000000..a23aa0b
--- /dev/null
+++ b/tests/stdio-test.cc
@@ -0,0 +1,22 @@
+#include <cstdio>
+#define TEST_NAME "stdio"
+#include "tests.hh"
+#include <stdio.h>
+
+void test_stdio()
+{
+	debug_log("Printing 'Hello, world!' to stdout");
+	printf("Hello, world!\n");
+	debug_log("Printing 'Hello, world!' to stderr");
+	fprintf(stderr, "Hello, world!\n");
+	const size_t BufferSize = 64;
+	char         buffer[BufferSize];
+	snprintf(buffer, BufferSize, "%d", 42);
+	TEST(strcmp(buffer, "42") == 0,
+	     "snprintf(\"%d\", 42) gave {}",
+	     std::string_view{buffer, BufferSize});
+	snprintf(buffer, BufferSize, "%d", -42);
+	TEST(strcmp(buffer, "-42") == 0,
+	     "snprintf(\"%d\", -42) gave {}",
+	     std::string_view{buffer, BufferSize});
+}
diff --git a/tests/test-runner.cc b/tests/test-runner.cc
index 9d1452d..8d99fdd 100644
--- a/tests/test-runner.cc
+++ b/tests/test-runner.cc
@@ -52,6 +52,9 @@
 #endif
 		return ErrorRecoveryBehaviour::ForceUnwind;
 	}
+	debug_log("mcause: {}, pcc: {}", mcause, frame->pcc);
+	auto [reg, cause] = CHERI::extract_cheri_mtval(mtval);
+	debug_log("Error {} in register {}", reg, cause);
 	debug_log("Current test crashed");
 	crashDetected = true;
 	return ErrorRecoveryBehaviour::InstallContext;
@@ -104,7 +107,10 @@
 	debug_log("Trying to print string: {}", std::string_view{testString, 13});
 
 	run_timed("All tests", []() {
+		run_timed("Debug helpers (C++)", test_debug_cxx);
+		run_timed("Debug helpers (C)", test_debug_c);
 		run_timed("MMIO", test_mmio);
+		run_timed("stdio", test_stdio);
 		run_timed("Static sealing", test_static_sealing);
 		run_timed("Crash recovery", test_crash_recovery);
 		run_timed("Compartment calls", test_compartment_call);
diff --git a/tests/tests.hh b/tests/tests.hh
index 4fec070..bd63a3c 100644
--- a/tests/tests.hh
+++ b/tests/tests.hh
@@ -19,6 +19,9 @@
 __cheri_compartment("check_pointer_test") void test_check_pointer();
 __cheri_compartment("misc_test") void test_misc();
 __cheri_compartment("static_sealing_test") void test_static_sealing();
+__cheri_compartment("stdio_test") void test_stdio();
+__cheri_compartment("debug_test") void test_debug_cxx();
+__cheri_compartment("debug_test") void test_debug_c();
 
 // Simple tests don't need a separate compartment.
 void test_global_constructors();
diff --git a/tests/xmake.lua b/tests/xmake.lua
index 4cb7213..2f9e5ed 100644
--- a/tests/xmake.lua
+++ b/tests/xmake.lua
@@ -49,9 +49,14 @@
 -- Test the futex implementation
 test("futex")
 -- Test locks built on top of the futex
-test("queue")
--- Test queues
 test("locks")
+-- Test queues
+test("queue")
+-- Test minimal stdio implementation
+test("stdio")
+-- Test the debug helpers.
+test("debug")
+	add_files("debug-test.c")
 -- Test the static sealing types
 test("static_sealing")
 compartment("static_sealing_inner")
@@ -92,6 +97,7 @@
     -- Helper libraries
     add_deps("freestanding", "string", "crt", "cxxrt", "atomic_fixed", "compartment_helpers", "debug")
     add_deps("message_queue", "locks", "event_group")
+    add_deps("stdio")
     -- Tests
     add_deps("mmio_test")
     add_deps("eventgroup_test")
@@ -108,6 +114,8 @@
     add_deps("compartment_calls_test", "compartment_calls_inner", "compartment_calls_inner_with_handler")
     add_deps("check_pointer_test")
     add_deps("misc_test")
+    add_deps("stdio_test")
+    add_deps("debug_test")
     -- Set the thread entry point to the test runner.
     on_load(function(target)
         target:values_set("board", "$(board)")