blob: 1e387b889e8b8c4e8e73a5e99b092b03367eee69 [file] [log] [blame]
#!/usr/bin/env python3
# Copyright 2025 The IREE Authors
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
"""Generate Unicode lookup tables for IREE's unicode utilities.
This script parses the Unicode Character Database (UCD) and generates
the unicode_tables.c file with category ranges, whitespace codepoints,
case mappings, and NFD decomposition tables.
Usage:
# Generate tables (downloads UCD if needed):
python unicode_tables_gen.py
# Check that existing tables match what would be generated:
python unicode_tables_gen.py --check
# Use a specific UCD version:
python unicode_tables_gen.py --unicode-version 15.1.0
# Use local UCD files:
python unicode_tables_gen.py --ucd-dir /path/to/ucd
"""
import argparse
import os
import sys
import urllib.request
from pathlib import Path
# Unicode Character Database URL template.
UCD_URL_TEMPLATE = "https://www.unicode.org/Public/{version}/ucd/{file}"
# Default Unicode version.
DEFAULT_UNICODE_VERSION = "15.1.0"
# Output file path (relative to repo root).
OUTPUT_PATH = "runtime/src/iree/base/internal/unicode_tables.c"
# Category flag values (must match unicode.h).
# Maps a major (first-letter) general category to its C flag macro name.
# NOTE(review): this table is not referenced anywhere in this script —
# generate_tables_c emits the single-letter macros (L/M/N/...) directly.
# Confirm whether CATEGORY_FLAGS is dead code or used by another tool.
CATEGORY_FLAGS = {
    "L": "CAT_L",  # Letter
    "M": "CAT_M",  # Mark
    "N": "CAT_N",  # Number
    "P": "CAT_P",  # Punctuation
    "S": "CAT_S",  # Symbol
    "Z": "CAT_Z",  # Separator
    "C": "CAT_C",  # Other
}
def download_ucd_file(version: str, filename: str, cache_dir: Path) -> Path:
    """Fetch a UCD file into the local cache, reusing any cached copy.

    Args:
      version: Unicode version string (e.g. "15.1.0").
      filename: UCD file name (e.g. "UnicodeData.txt").
      cache_dir: Root directory of the download cache.

    Returns:
      Path to the cached file. Exits the process if the download fails.
    """
    target = cache_dir / version / filename
    if not target.exists():
        source_url = UCD_URL_TEMPLATE.format(version=version, file=filename)
        print(f"Downloading {source_url}...")
        target.parent.mkdir(parents=True, exist_ok=True)
        try:
            urllib.request.urlretrieve(source_url, target)
        except Exception as e:
            print(f"Error downloading {source_url}: {e}", file=sys.stderr)
            sys.exit(1)
    return target
def parse_unicode_data(path: Path) -> dict:
    """Read UnicodeData.txt and collect categories plus simple case mappings.

    Args:
      path: Path to UnicodeData.txt.

    Returns:
      Dict with keys:
        "categories": codepoint -> general category string (e.g. "Lu").
        "uppercase": codepoint -> simple uppercase mapping codepoint.
        "lowercase": codepoint -> simple lowercase mapping codepoint.
    """
    categories: dict = {}
    uppercase: dict = {}
    lowercase: dict = {}
    range_first = None  # Pending start codepoint of a <..., First>/<..., Last> pair.
    with open(path, "r", encoding="utf-8") as f:
        for raw in f:
            record = raw.strip()
            if not record or record.startswith("#"):
                continue
            fields = record.split(";")
            if len(fields) < 15:
                continue
            cp = int(fields[0], 16)
            name = fields[1]
            category = fields[2]
            upper_field = fields[12]
            lower_field = fields[13]
            # Ranged entries span two records, e.g. "<CJK Ideograph, First>"
            # followed by "<CJK Ideograph, Last>"; expand when Last arrives.
            # Ranged codepoints get a category but no case mappings.
            if name.endswith(", First>"):
                range_first = cp
                continue
            if name.endswith(", Last>"):
                for ranged_cp in range(range_first, cp + 1):
                    categories[ranged_cp] = category
                continue
            categories[cp] = category
            if upper_field:
                uppercase[cp] = int(upper_field, 16)
            if lower_field:
                lowercase[cp] = int(lower_field, 16)
    return {
        "categories": categories,
        "uppercase": uppercase,
        "lowercase": lowercase,
    }
def parse_prop_list(path: Path) -> set:
    """Collect every codepoint carrying the White_Space property.

    Args:
      path: Path to PropList.txt.

    Returns:
      Set of codepoints with the White_Space property.
    """
    result: set = set()
    with open(path, "r", encoding="utf-8") as f:
        for raw in f:
            text = raw.strip()
            if not text or text.startswith("#"):
                continue
            # Drop any trailing comment before splitting the record.
            text = text.split("#", 1)[0].strip()
            fields = [part.strip() for part in text.split(";")]
            if len(fields) < 2 or fields[1] != "White_Space":
                continue
            span = fields[0]
            # Records are either a single codepoint or "XXXX..YYYY" ranges.
            if ".." in span:
                first, last = span.split("..")
                result.update(range(int(first, 16), int(last, 16) + 1))
            else:
                result.add(int(span, 16))
    return result
def parse_unicode_data_for_decomposition(path: Path, categories: dict) -> dict:
    """Extract simple canonical (NFD) base-character mappings from UnicodeData.txt.

    Only the first codepoint of each canonical decomposition is kept, and only
    when that base is not a combining mark. Compatibility decompositions
    (those tagged "<...>") are skipped entirely.

    Args:
      path: Path to UnicodeData.txt.
      categories: Dict mapping codepoint -> category (from parse_unicode_data).

    Returns:
      Dict mapping codepoint -> base codepoint.
    """
    mappings: dict = {}
    with open(path, "r", encoding="utf-8") as f:
        for raw in f:
            text = raw.strip()
            if not text or text.startswith("#"):
                continue
            fields = text.split(";")
            if len(fields) < 6:
                continue
            decomposition = fields[5].strip()
            # Empty field: no decomposition. A leading "<tag>" marks a
            # compatibility decomposition, which NFD does not apply.
            if not decomposition or decomposition.startswith("<"):
                continue
            # The first codepoint of the decomposition is the base character.
            base_cp = int(decomposition.split()[0], 16)
            # Reject bases that are combining marks (category M: Mn, Mc, Me).
            if not categories.get(base_cp, "").startswith("M"):
                mappings[int(fields[0], 16)] = base_cp
    return mappings
def build_category_ranges(categories: dict) -> list:
    """Collapse per-codepoint categories into sorted (start, end, major) ranges.

    Codepoints are grouped by their major class (the first letter of the
    category, e.g. "Lu" -> "L"; an empty category falls back to "C"),
    consecutive codepoints within a class are merged into ranges, and ASCII
    ranges (< 0x80) are dropped because the runtime handles ASCII inline.

    Args:
      categories: Dict mapping codepoint -> category string.

    Returns:
      List of (start, end, major_letter) tuples sorted by start codepoint.
    """
    # Bucket codepoints by major category letter.
    buckets: dict = {}
    for codepoint, category in sorted(categories.items()):
        major = category[0] if category else "C"
        buckets.setdefault(major, []).append(codepoint)
    # Merge each bucket's consecutive codepoints into (start, end) runs.
    merged = []
    for major, bucket in buckets.items():
        ordered = sorted(set(bucket))
        if not ordered:
            continue
        run_start = run_end = ordered[0]
        for codepoint in ordered[1:]:
            if codepoint == run_end + 1:
                run_end = codepoint
                continue
            merged.append((run_start, run_end, major))
            run_start = run_end = codepoint
        merged.append((run_start, run_end, major))
    merged.sort(key=lambda entry: entry[0])
    # Keep only non-ASCII ranges; ASCII is handled inline by the runtime.
    return [r for r in merged if r[0] >= 0x80]
def hex_compact(value: int) -> str:
    """Format *value* as uppercase hex with a 0x prefix, or "0" for zero."""
    return f"0x{value:X}" if value else "0"
def pack_entries(
    entries: list, indent: str = " ", max_line_length: int = 100
) -> list:
    """Pack entries into indented lines no longer than max_line_length.

    Entries are separated by single spaces; a line is flushed just before
    appending an entry would overflow it.

    Args:
      entries: Strings to pack (e.g. C initializer fragments).
      indent: Prefix applied to every emitted line.
      max_line_length: Soft maximum length of each emitted line; a single
        entry longer than the limit still gets its own (over-long) line.

    Returns:
      List of packed lines with trailing whitespace stripped.
    """
    lines = []
    current_line = indent
    for entry in entries:
        # Flush before overflowing, but never flush a line that holds no
        # entries yet: the original code emitted a spurious blank line when
        # one entry alone exceeded max_line_length.
        if (
            len(current_line) + len(entry) + 1 > max_line_length
            and current_line.strip()
        ):
            lines.append(current_line.rstrip())
            current_line = indent
        current_line += entry + " "
    if current_line.strip():
        lines.append(current_line.rstrip())
    return lines
def generate_tables_c(
    version: str,
    category_ranges: list,
    whitespace: set,
    case_mappings: dict,
    nfd_mappings: dict,
) -> str:
    """Generate the unicode_tables.c file content.

    Args:
      version: Unicode version string embedded in the generated file header.
      category_ranges: List of (start, end, major_letter) tuples from
        build_category_ranges (already filtered to non-ASCII).
      whitespace: Set of White_Space codepoints; ASCII entries are skipped
        here because the runtime handles ASCII inline.
      case_mappings: Dict with "uppercase" and "lowercase" codepoint maps.
      nfd_mappings: Dict mapping codepoint -> NFD base codepoint.

    Returns:
      The complete C source text, joined with newlines. This text is compared
      byte-for-byte against the committed file in --check mode, so the
      templates below must not be edited casually.

    NOTE(review): leading indentation inside the C template literals may have
    been lost when this file was extracted — verify the templates against the
    committed unicode_tables.c before regenerating.
    """
    lines = []
    # Header with prominent disclaimer and clang-format off.
    # This first template goes through .format(), hence the doubled "{{" that
    # emits the literal opening brace of the array initializer.
    lines.append(
        """\
// Copyright 2025 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// clang-format off
//===----------------------------------------------------------------------===//
// ********************** DO NOT EDIT THIS FILE **********************
//===----------------------------------------------------------------------===//
//
// Unicode lookup tables generated from the Unicode Character Database.
//
// Generated by: build_tools/scripts/unicode_tables_gen.py
// Unicode version: {version}
//
// To regenerate:
// python3 build_tools/scripts/unicode_tables_gen.py
//
//===----------------------------------------------------------------------===//
#include "iree/base/internal/unicode.h"
//===----------------------------------------------------------------------===//
// Category range table
//===----------------------------------------------------------------------===//
#define L (1 << 0)
#define M (1 << 1)
#define N (1 << 2)
#define P (1 << 3)
#define S (1 << 4)
#define Z (1 << 5)
#define C (1 << 6)
const iree_unicode_category_range_t iree_unicode_category_ranges[] = {{""".format(
            version=version
        )
    )
    # Category ranges - packed. Each entry is {start,end,MAJOR} where MAJOR is
    # one of the single-letter macros #defined above.
    cat_entries = []
    for start, end, category in category_ranges:
        cat_entries.append(f"{{{hex_compact(start)},{hex_compact(end)},{category}}},")
    lines.extend(pack_entries(cat_entries))
    lines.append(
        """\
};
#undef L
#undef M
#undef N
#undef P
#undef S
#undef Z
#undef C
const iree_host_size_t iree_unicode_category_ranges_count =
sizeof(iree_unicode_category_ranges) / sizeof(iree_unicode_category_ranges[0]);
//===----------------------------------------------------------------------===//
// Whitespace codepoints (White_Space property)
//===----------------------------------------------------------------------===//
const uint32_t iree_unicode_whitespace_codepoints[] = {"""
    )
    # Whitespace (non-ASCII only, ASCII is handled inline) - packed.
    ws_entries = [f"{hex_compact(cp)}," for cp in sorted(whitespace) if cp >= 0x80]
    lines.extend(pack_entries(ws_entries))
    lines.append(
        """\
};
const iree_host_size_t iree_unicode_whitespace_count =
sizeof(iree_unicode_whitespace_codepoints) / sizeof(iree_unicode_whitespace_codepoints[0]);
//===----------------------------------------------------------------------===//
// Case mapping table
//===----------------------------------------------------------------------===//
const iree_unicode_case_mapping_t iree_unicode_case_mappings[] = {"""
    )
    # Case mappings (non-ASCII only) - packed. A codepoint with only one of
    # the two mappings gets 0 for the missing direction.
    case_codepoints = sorted(
        set(case_mappings["uppercase"].keys()) | set(case_mappings["lowercase"].keys())
    )
    case_entries = []
    for cp in case_codepoints:
        if cp < 0x80:
            continue
        lower = case_mappings["lowercase"].get(cp, 0)
        upper = case_mappings["uppercase"].get(cp, 0)
        case_entries.append(
            f"{{{hex_compact(cp)},{hex_compact(lower)},{hex_compact(upper)}}},"
        )
    lines.extend(pack_entries(case_entries))
    lines.append(
        """\
};
const iree_host_size_t iree_unicode_case_mappings_count =
sizeof(iree_unicode_case_mappings) / sizeof(iree_unicode_case_mappings[0]);
//===----------------------------------------------------------------------===//
// NFD base character table (simple 1:1 decomposition)
//===----------------------------------------------------------------------===//
const iree_unicode_nfd_mapping_t iree_unicode_nfd_mappings[] = {"""
    )
    # NFD mappings - packed.
    nfd_entries = []
    for cp in sorted(nfd_mappings.keys()):
        base = nfd_mappings[cp]
        nfd_entries.append(f"{{{hex_compact(cp)},{hex_compact(base)}}},")
    lines.extend(pack_entries(nfd_entries))
    lines.append(
        """\
};
const iree_host_size_t iree_unicode_nfd_mappings_count =
sizeof(iree_unicode_nfd_mappings) / sizeof(iree_unicode_nfd_mappings[0]);
"""
    )
    return "\n".join(lines)
def find_repo_root() -> Path:
    """Locate the IREE repository root by walking up from this script.

    An ancestor directory containing runtime/src/iree is taken as the root;
    the filesystem root itself is never a candidate. Exits the process with
    an error when nothing matches.
    """
    start = Path(__file__).resolve()
    # Probe the resolved path and every ancestor except the filesystem root.
    for candidate in [start] + list(start.parents)[:-1]:
        if (candidate / "runtime" / "src" / "iree").exists():
            return candidate
    print("Error: Could not find IREE repository root", file=sys.stderr)
    sys.exit(1)
def main():
    """Entry point: parse arguments, build the tables, then write or verify."""
    parser = argparse.ArgumentParser(description="Generate Unicode lookup tables")
    parser.add_argument(
        "--unicode-version",
        default=DEFAULT_UNICODE_VERSION,
        help=f"Unicode version (default: {DEFAULT_UNICODE_VERSION})",
    )
    parser.add_argument(
        "--ucd-dir",
        type=Path,
        help="Path to local UCD directory (downloads if not specified)",
    )
    parser.add_argument(
        "--check",
        action="store_true",
        help="Check that existing tables match what would be generated",
    )
    parser.add_argument(
        "--output",
        type=Path,
        help="Output file path (default: auto-detect from repo root)",
    )
    args = parser.parse_args()
    repo_root = find_repo_root()
    # `or` binds looser than `/`: uses args.output when given, otherwise
    # repo_root / OUTPUT_PATH.
    output_path = args.output or repo_root / OUTPUT_PATH
    # Determine UCD file locations.
    if args.ucd_dir:
        ucd_dir = args.ucd_dir
        unicode_data_path = ucd_dir / "UnicodeData.txt"
        prop_list_path = ucd_dir / "PropList.txt"
    else:
        # Downloads are cached under the user's home so repeat runs are
        # offline.
        cache_dir = Path.home() / ".cache" / "iree" / "ucd"
        unicode_data_path = download_ucd_file(
            args.unicode_version, "UnicodeData.txt", cache_dir
        )
        prop_list_path = download_ucd_file(
            args.unicode_version, "PropList.txt", cache_dir
        )
    print(f"Parsing UnicodeData.txt...")
    unicode_data = parse_unicode_data(unicode_data_path)
    print(f"Parsing PropList.txt...")
    whitespace = parse_prop_list(prop_list_path)
    print(f"Parsing NFD decompositions...")
    # NFD extraction needs the category map to filter out combining-mark
    # bases, so it runs after parse_unicode_data.
    nfd_mappings = parse_unicode_data_for_decomposition(
        unicode_data_path, unicode_data["categories"]
    )
    print(f"Building category ranges...")
    category_ranges = build_category_ranges(unicode_data["categories"])
    print(f"Generating tables...")
    case_mappings = {
        "uppercase": unicode_data["uppercase"],
        "lowercase": unicode_data["lowercase"],
    }
    content = generate_tables_c(
        args.unicode_version,
        category_ranges,
        whitespace,
        case_mappings,
        nfd_mappings,
    )
    if args.check:
        # --check mode: byte-compare against the committed file; never write.
        if not output_path.exists():
            print(f"Error: {output_path} does not exist", file=sys.stderr)
            sys.exit(1)
        existing = output_path.read_text()
        if existing == content:
            print("Tables are up to date.")
            sys.exit(0)
        else:
            print(f"Error: {output_path} is out of date", file=sys.stderr)
            print("Run 'python unicode_tables_gen.py' to regenerate", file=sys.stderr)
            sys.exit(1)
    else:
        print(f"Writing {output_path}...")
        output_path.write_text(content)
        print(
            f"Done. Generated {len(category_ranges)} category ranges, "
            f"{len(whitespace)} whitespace codepoints, "
            f"{len(case_mappings['uppercase']) + len(case_mappings['lowercase'])} case mappings, "
            f"{len(nfd_mappings)} NFD mappings."
        )


if __name__ == "__main__":
    main()