| // Copyright 2026 The IREE Authors |
| // |
| // Licensed under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| |
| #include "iree/tokenizer/segmenter/split.h" |
| |
| #include <string.h> |
| |
| //===----------------------------------------------------------------------===// |
| // Types |
| //===----------------------------------------------------------------------===// |
| |
// Split segmenter: divides input text around pattern matches, where the
// pattern is either a compiled regex DFA or a literal byte string. How the
// matched (delimiter) bytes are attributed to output segments is controlled
// by |behavior|; |invert| swaps the roles of matches and gaps.
typedef struct iree_tokenizer_segmenter_split_t {
  iree_tokenizer_segmenter_t base;
  // Allocator used for this struct and all owned storage below.
  iree_allocator_t allocator;
  // How matched delimiter bytes are attributed to emitted segments.
  iree_tokenizer_regex_split_behavior_t behavior;
  // When true the pattern matches the content to keep instead of delimiters.
  bool invert;
  // false = regex mode, true = literal mode.
  // Selects which arm of the union below is valid.
  bool is_literal;
  union {
    struct {
      iree_tokenizer_regex_dfa_t dfa;
      // Stride acceleration table derived from the DFA; owned.
      iree_tokenizer_regex_stride_t* stride;
      // Backing storage for the DFA; owned, freed on destroy.
      uint8_t* dfa_data;
    } regex;
    struct {
      iree_string_view_t pattern;  // Points into pattern_data
      // Owned copy of pattern bytes.
      uint8_t* pattern_data;
    } literal;
  };
} iree_tokenizer_segmenter_split_t;
| |
// Per-run state for the split segmenter. Lives in caller-provided storage and
// is zeroed by state_initialize, so all offsets and flags start at 0/false.
typedef struct iree_tokenizer_segmenter_split_state_t {
  iree_tokenizer_segmenter_state_t base;
  // Regex execution state; only initialized/used when !is_literal.
  iree_tokenizer_regex_exec_state_t regex_state;

  // Position tracking (all absolute byte offsets).
  iree_host_size_t bytes_processed;
  // Absolute end of the last emitted segment; the next match's gap starts
  // here.
  iree_host_size_t last_emit_end;

  // Pending segment for MERGED_WITH_NEXT and CONTIGUOUS behaviors.
  // Valid only while has_pending is true.
  iree_host_size_t pending_start;
  iree_host_size_t pending_end;

  // Cached from segmenter for hot path.
  iree_tokenizer_regex_split_behavior_t behavior;
  bool invert;
  bool is_literal;
  bool has_pending;

  // Set when process() caps consumed due to a pending regex match. The regex
  // state was reset to avoid position underflow, but finalize() will re-scan
  // the unconsumed bytes to emit the deferred match. has_pending() checks this
  // flag to correctly indicate that finalize() is needed.
  bool deferred_to_finalize;

  // Set after finalize's feed+regex phases complete. Prevents re-running them
  // on re-entrant finalize calls (where only trailing flush needs to resume).
  bool finalize_feed_done;

  // Literal mode: tracks partial match position when pattern spans chunks.
  // Value is number of pattern bytes matched so far (0 = no partial match).
  iree_host_size_t literal_match_position;
  // Absolute position where the partial match started.
  iree_host_size_t literal_match_start;
} iree_tokenizer_segmenter_split_state_t;
| |
| // Result of processing one match. Handlers are pure functions that compute |
| // this result; the main loop commits state changes only if all emits succeed. |
| // Layout optimized: size_t fields first, then small fields packed at end. |
// Result of processing one match. Handlers are pure functions that compute
// this result; the main loop commits state changes only if all emits succeed.
// Layout optimized: size_t fields first, then small fields packed at end.
typedef struct {
  // Up to two [start, end) pairs, in positional order.
  iree_host_size_t segments[4];  // [start0, end0, start1, end1]
  // Replacement pending region (valid only when has_pending is true).
  iree_host_size_t pending_start;
  iree_host_size_t pending_end;
  // New value for the state's last_emit_end on commit.
  iree_host_size_t last_emit_end;
  // 0, 1, or 2 segments.
  uint8_t segment_count;
  bool has_pending;
} iree_tokenizer_split_match_result_t;
| |
| //===----------------------------------------------------------------------===// |
| // Behavior Handlers (pure functions, no side effects) |
| //===----------------------------------------------------------------------===// |
| |
| // REMOVED: Emit gap (or match if inverted), discard the other. |
| // Normal mode (invert=false): pattern matches delimiters, emit gaps (content). |
| // Invert mode (invert=true): pattern matches tokens, emit matches (content). |
| static iree_tokenizer_split_match_result_t iree_tokenizer_split_handle_removed( |
| iree_host_size_t gap_start, iree_host_size_t gap_end, |
| iree_host_size_t match_start, iree_host_size_t match_end, |
| iree_host_size_t pending_start, iree_host_size_t pending_end, |
| bool has_pending, bool invert) { |
| (void)pending_start; |
| (void)pending_end; |
| (void)has_pending; |
| iree_tokenizer_split_match_result_t result = {0}; |
| if (invert) { |
| // Invert: emit the match (token), discard the gap (delimiter). |
| if (match_end > match_start) { |
| result.segments[0] = match_start; |
| result.segments[1] = match_end; |
| result.segment_count = 1; |
| } |
| } else { |
| // Normal: emit the gap (content), discard the match (delimiter). |
| if (gap_end > gap_start) { |
| result.segments[0] = gap_start; |
| result.segments[1] = gap_end; |
| result.segment_count = 1; |
| } |
| } |
| result.last_emit_end = match_end; |
| return result; |
| } |
| |
| // ISOLATED: Emit gap and match as separate segments. |
| // Order is always gap first, then match (preserves positional order). |
| // The invert flag doesn't change the behavior - both are always emitted. |
| static iree_tokenizer_split_match_result_t iree_tokenizer_split_handle_isolated( |
| iree_host_size_t gap_start, iree_host_size_t gap_end, |
| iree_host_size_t match_start, iree_host_size_t match_end, |
| iree_host_size_t pending_start, iree_host_size_t pending_end, |
| bool has_pending, bool invert) { |
| (void)pending_start; |
| (void)pending_end; |
| (void)has_pending; |
| (void)invert; // ISOLATED emits both regardless of invert. |
| |
| iree_tokenizer_split_match_result_t result = {0}; |
| if (gap_end > gap_start) { |
| result.segments[0] = gap_start; |
| result.segments[1] = gap_end; |
| result.segments[2] = match_start; |
| result.segments[3] = match_end; |
| result.segment_count = 2; |
| } else { |
| result.segments[0] = match_start; |
| result.segments[1] = match_end; |
| result.segment_count = 1; |
| } |
| result.last_emit_end = match_end; |
| return result; |
| } |
| |
| // MERGED_WITH_PREVIOUS: Emit [gap_start, match_end) as one segment. |
| // Merges the content before a delimiter with the delimiter itself. |
| // In invert mode, this merges the delimiter with the following token. |
| static iree_tokenizer_split_match_result_t |
| iree_tokenizer_split_handle_merged_with_previous(iree_host_size_t gap_start, |
| iree_host_size_t match_end, |
| iree_host_size_t pending_start, |
| iree_host_size_t pending_end, |
| bool has_pending, |
| bool invert) { |
| (void)pending_start; |
| (void)pending_end; |
| (void)has_pending; |
| (void)invert; // Same behavior: always merge [gap_start, match_end). |
| iree_tokenizer_split_match_result_t result = {0}; |
| result.segments[0] = gap_start; |
| result.segments[1] = match_end; |
| result.segment_count = 1; |
| result.last_emit_end = match_end; |
| return result; |
| } |
| |
| // MERGED_WITH_NEXT: Prepend pending to gap, buffer match for next. |
| // Normal: buffer delimiter (match), prepend to next content (gap). |
| // Invert: buffer delimiter (gap), prepend to next content (match). |
| static iree_tokenizer_split_match_result_t |
| iree_tokenizer_split_handle_merged_with_next(iree_host_size_t gap_start, |
| iree_host_size_t gap_end, |
| iree_host_size_t match_start, |
| iree_host_size_t match_end, |
| iree_host_size_t pending_start, |
| bool has_pending, bool invert) { |
| iree_tokenizer_split_match_result_t result = {0}; |
| if (invert) { |
| // Invert: emit pending+match (content), buffer gap (delimiter). |
| iree_host_size_t seg_start = has_pending ? pending_start : match_start; |
| if (match_end > seg_start) { |
| result.segments[0] = seg_start; |
| result.segments[1] = match_end; |
| result.segment_count = 1; |
| } |
| result.pending_start = gap_start; |
| result.pending_end = gap_end; |
| result.has_pending = (gap_end > gap_start); |
| } else { |
| // Normal: emit pending+gap (content), buffer match (delimiter). |
| iree_host_size_t seg_start = has_pending ? pending_start : gap_start; |
| if (gap_end > seg_start) { |
| result.segments[0] = seg_start; |
| result.segments[1] = gap_end; |
| result.segment_count = 1; |
| } |
| result.pending_start = match_start; |
| result.pending_end = match_end; |
| result.has_pending = true; |
| } |
| result.last_emit_end = match_end; |
| return result; |
| } |
| |
| // CONTIGUOUS: Merge consecutive delimiters, emit content between. |
| // Normal: merge consecutive matches (delimiters), emit gaps (content). |
| // Invert: merge consecutive gaps (delimiters), emit matches (content). |
// CONTIGUOUS: Merge consecutive delimiters, emit content between.
// Normal: merge consecutive matches (delimiters), emit gaps (content).
// Invert: merge consecutive gaps (delimiters), emit matches (content).
// A run of adjacent delimiters is accumulated in pending_[start,end) and is
// only emitted once a non-adjacent region breaks the run.
static iree_tokenizer_split_match_result_t
iree_tokenizer_split_handle_contiguous(iree_host_size_t gap_start,
                                       iree_host_size_t gap_end,
                                       iree_host_size_t match_start,
                                       iree_host_size_t match_end,
                                       iree_host_size_t pending_start,
                                       iree_host_size_t pending_end,
                                       bool has_pending, bool invert) {
  iree_tokenizer_split_match_result_t result = {0};
  if (invert) {
    // Invert: merge consecutive gaps (delimiters), emit matches (content).
    // The gap_end > gap_start guard keeps empty gaps from taking the extend
    // path (an empty gap means the previous match abuts this one).
    if (has_pending && pending_end == gap_start && gap_end > gap_start) {
      // Extend pending gap (consecutive delimiters).
      // NOTE(review): this branch does not emit the current match; it looks
      // reachable only when the run of delimiters is unbroken — confirm that
      // a non-empty match cannot follow a gap contiguous with pending.
      result.pending_start = pending_start;
      result.pending_end = gap_end;
      result.has_pending = true;
    } else {
      // Emit pending (merged delimiters), emit match (content), buffer gap.
      // segments[] holds at most two [start,end) pairs; segment_count*2
      // indexes the next free pair.
      if (has_pending) {
        result.segments[0] = pending_start;
        result.segments[1] = pending_end;
        result.segment_count = 1;
      }
      if (match_end > match_start) {
        result.segments[result.segment_count * 2] = match_start;
        result.segments[result.segment_count * 2 + 1] = match_end;
        result.segment_count++;
      }
      if (gap_end > gap_start) {
        result.pending_start = gap_start;
        result.pending_end = gap_end;
        result.has_pending = true;
      }
    }
  } else {
    // Normal: merge consecutive matches (delimiters), emit gaps (content).
    // pending_end == match_start means no content separates the buffered
    // delimiter run from this match.
    if (has_pending && pending_end == match_start) {
      // Extend pending match (consecutive delimiters).
      result.pending_start = pending_start;
      result.pending_end = match_end;
      result.has_pending = true;
    } else {
      // Emit pending (merged delimiters), emit gap (content), buffer match.
      if (has_pending) {
        result.segments[0] = pending_start;
        result.segments[1] = pending_end;
        result.segment_count = 1;
      }
      if (gap_end > gap_start) {
        result.segments[result.segment_count * 2] = gap_start;
        result.segments[result.segment_count * 2 + 1] = gap_end;
        result.segment_count++;
      }
      result.pending_start = match_start;
      result.pending_end = match_end;
      result.has_pending = true;
    }
  }
  result.last_emit_end = match_end;
  return result;
}
| |
| //===----------------------------------------------------------------------===// |
| // Split Processing |
| //===----------------------------------------------------------------------===// |
| |
| // Dispatches to the appropriate behavior handler. |
| IREE_ATTRIBUTE_ALWAYS_INLINE static inline iree_tokenizer_split_match_result_t |
| iree_tokenizer_split_handle_match( |
| iree_tokenizer_regex_split_behavior_t behavior, iree_host_size_t gap_start, |
| iree_host_size_t gap_end, iree_host_size_t match_start, |
| iree_host_size_t match_end, iree_host_size_t pending_start, |
| iree_host_size_t pending_end, bool has_pending, bool invert) { |
| switch (behavior) { |
| case IREE_TOKENIZER_UTIL_REGEX_SPLIT_REMOVED: |
| return iree_tokenizer_split_handle_removed( |
| gap_start, gap_end, match_start, match_end, pending_start, |
| pending_end, has_pending, invert); |
| case IREE_TOKENIZER_UTIL_REGEX_SPLIT_ISOLATED: |
| return iree_tokenizer_split_handle_isolated( |
| gap_start, gap_end, match_start, match_end, pending_start, |
| pending_end, has_pending, invert); |
| case IREE_TOKENIZER_UTIL_REGEX_SPLIT_MERGED_WITH_PREVIOUS: |
| return iree_tokenizer_split_handle_merged_with_previous( |
| gap_start, match_end, pending_start, pending_end, has_pending, |
| invert); |
| case IREE_TOKENIZER_UTIL_REGEX_SPLIT_MERGED_WITH_NEXT: |
| return iree_tokenizer_split_handle_merged_with_next( |
| gap_start, gap_end, match_start, match_end, pending_start, |
| has_pending, invert); |
| case IREE_TOKENIZER_UTIL_REGEX_SPLIT_CONTIGUOUS: |
| return iree_tokenizer_split_handle_contiguous( |
| gap_start, gap_end, match_start, match_end, pending_start, |
| pending_end, has_pending, invert); |
| default: { |
| IREE_ASSERT(false && "invalid split behavior enum value"); |
| iree_tokenizer_split_match_result_t result = {0}; |
| return result; |
| } |
| } |
| } |
| |
// Bounded writer for output segments. Tracks how many segments were written,
// converts absolute offsets to chunk-relative ones on demand, and latches
// |full| when capacity is exhausted so callers can detect overflow after a
// batch of emits.
typedef struct {
  iree_tokenizer_segment_output_t output;
  // Absolute offset of the current chunk's first byte; subtracted from
  // absolute positions when emitting chunk-relative segments.
  iree_host_size_t chunk_base;
  // Number of segments written to |output| so far.
  iree_host_size_t count;
  // End of last successfully emitted segment.
  iree_host_size_t last_end;
  // True once an emit was dropped because |output| hit capacity.
  bool full;
} iree_tokenizer_split_emitter_t;
| |
| // Emits a segment. Empty segments (start >= end) are skipped. |
| // Sets emitter->full on overflow; caller should check after batch of emits. |
| static inline void iree_tokenizer_split_emit( |
| iree_tokenizer_split_emitter_t* emitter, iree_host_size_t start, |
| iree_host_size_t end, bool absolute) { |
| if (start >= end || emitter->full) return; |
| if (emitter->count >= emitter->output.capacity) { |
| emitter->full = true; |
| return; |
| } |
| iree_host_size_t offset = absolute ? 0 : emitter->chunk_base; |
| emitter->output.values[emitter->count].start = start - offset; |
| emitter->output.values[emitter->count].end = end - offset; |
| emitter->count++; |
| emitter->last_end = end; |
| } |
| |
| // Emits all segments from a match result. Returns true if none overflowed. |
| static inline bool iree_tokenizer_split_emit_result( |
| iree_tokenizer_split_emitter_t* emitter, |
| const iree_tokenizer_split_match_result_t* result, bool absolute) { |
| for (uint8_t i = 0; i < result->segment_count; ++i) { |
| iree_tokenizer_split_emit(emitter, result->segments[i * 2], |
| result->segments[i * 2 + 1], absolute); |
| } |
| return !emitter->full; |
| } |
| |
| // Context for the inline callback, shared between process() and finalize(). |
| // Holds a transient copy of mutable state that is committed back to the |
| // persistent state struct after the regex feed/finalize completes. |
// Context for the inline callback, shared between process() and finalize().
// Holds a transient copy of mutable state that is committed back to the
// persistent state struct after the regex feed/finalize completes.
typedef struct iree_tokenizer_split_callback_context_t {
  iree_tokenizer_split_emitter_t* emitter;
  // Mutable state (updated per-match on successful emit).
  iree_host_size_t last_emit_end;
  iree_host_size_t pending_start;
  iree_host_size_t pending_end;
  bool has_pending;
  // Immutable config.
  iree_tokenizer_regex_split_behavior_t behavior;
  bool invert;
  // When false, emitted positions are rebased to the emitter's chunk_base;
  // when true they are passed through as absolute offsets.
  bool absolute;
} iree_tokenizer_split_callback_context_t;
| |
| // Processes each regex match inline: computes behavior result, emits segments, |
| // and commits state. Returns RESOURCE_EXHAUSTED if output capacity is |
| // insufficient (the match is not committed and the regex will stop). |
| static inline iree_status_t iree_tokenizer_split_inline_callback( |
| void* user_data, iree_tokenizer_regex_match_t match) { |
| iree_tokenizer_split_callback_context_t* context = |
| (iree_tokenizer_split_callback_context_t*)user_data; |
| |
| // Compute behavior result (pure, no side effects). |
| // Gap is the region between the last emit and the current match start. |
| // Match is the matched region itself. |
| // The invert flag is passed to handlers so they can swap semantics internally |
| // (e.g., REMOVED emits match instead of gap when inverted). |
| iree_tokenizer_split_match_result_t result = |
| iree_tokenizer_split_handle_match( |
| context->behavior, context->last_emit_end, match.start, match.start, |
| match.end, context->pending_start, context->pending_end, |
| context->has_pending, context->invert); |
| |
| // Emit segments. If the emitter fills mid-result, the partially-emitted |
| // segments are committed (emitter.last_end tracks progress), and this match's |
| // full state changes are NOT committed to context (ensuring re-scan picks |
| // up the un-emitted segments). |
| if (!iree_tokenizer_split_emit_result(context->emitter, &result, |
| context->absolute)) { |
| return iree_status_from_code(IREE_STATUS_RESOURCE_EXHAUSTED); |
| } |
| |
| // All segments emitted successfully. Commit state. |
| context->last_emit_end = result.last_emit_end; |
| context->has_pending = result.has_pending; |
| context->pending_start = result.pending_start; |
| context->pending_end = result.pending_end; |
| |
| return iree_ok_status(); |
| } |
| |
| // Processes input with literal string matching. Returns a status and updates |
| // emitter/context state. Uses the same callback context structure as regex |
| // mode for consistency. |
| // |
| // The literal matching algorithm: |
| // 1. If we have a partial match from previous chunk, try to continue it. |
| // 2. Scan for new matches byte-by-byte. |
| // 3. Track partial matches at end of chunk that may continue next time. |
| static iree_status_t iree_tokenizer_split_literal_process( |
| const iree_tokenizer_segmenter_split_t* segmenter, |
| iree_tokenizer_segmenter_split_state_t* state, iree_string_view_t input, |
| iree_host_size_t chunk_base, |
| iree_tokenizer_split_callback_context_t* context) { |
| const char* pattern = segmenter->literal.pattern.data; |
| iree_host_size_t pattern_length = segmenter->literal.pattern.size; |
| |
| iree_host_size_t scan_position = 0; |
| |
| // Resume partial match from previous chunk if any. |
| if (state->literal_match_position > 0) { |
| iree_host_size_t remaining = pattern_length - state->literal_match_position; |
| iree_host_size_t can_check = |
| (remaining < input.size) ? remaining : input.size; |
| |
| // Check if the partial match continues. |
| if (memcmp(input.data, pattern + state->literal_match_position, |
| can_check) == 0) { |
| if (can_check == remaining) { |
| // Full match completed! Report the match. |
| iree_host_size_t match_start = state->literal_match_start; |
| iree_host_size_t match_end = chunk_base + can_check; |
| iree_tokenizer_regex_match_t match = {match_start, match_end}; |
| iree_status_t status = |
| iree_tokenizer_split_inline_callback(context, match); |
| if (!iree_status_is_ok(status)) { |
| // Reset partial match state since we're restarting. |
| state->literal_match_position = 0; |
| return status; |
| } |
| scan_position = can_check; |
| state->literal_match_position = 0; |
| } else { |
| // Still partial - need more input. |
| state->literal_match_position += can_check; |
| return iree_ok_status(); |
| } |
| } else { |
| // Partial match failed. We need to re-scan from after the false start. |
| // The bytes from literal_match_start are already consumed by previous |
| // chunks, so we just reset and continue scanning this chunk. |
| state->literal_match_position = 0; |
| } |
| } |
| |
| // Scan for new matches. |
| while (scan_position < input.size) { |
| // Look for the first character of the pattern. |
| const char* found = memchr(input.data + scan_position, pattern[0], |
| input.size - scan_position); |
| if (!found) { |
| // No more potential matches in this chunk. |
| break; |
| } |
| |
| iree_host_size_t match_offset = (iree_host_size_t)(found - input.data); |
| iree_host_size_t remaining_input = input.size - match_offset; |
| |
| if (remaining_input >= pattern_length) { |
| // Enough bytes to check full pattern. |
| if (memcmp(found, pattern, pattern_length) == 0) { |
| // Full match found. |
| iree_host_size_t match_start = chunk_base + match_offset; |
| iree_host_size_t match_end = match_start + pattern_length; |
| iree_tokenizer_regex_match_t match = {match_start, match_end}; |
| iree_status_t status = |
| iree_tokenizer_split_inline_callback(context, match); |
| if (!iree_status_is_ok(status)) { |
| return status; |
| } |
| scan_position = match_offset + pattern_length; |
| } else { |
| // Not a match, continue from next byte. |
| scan_position = match_offset + 1; |
| } |
| } else { |
| // Potential partial match at end of chunk. |
| if (memcmp(found, pattern, remaining_input) == 0) { |
| // Partial match - save state for next chunk. |
| state->literal_match_start = chunk_base + match_offset; |
| state->literal_match_position = remaining_input; |
| } |
| // Either way, we're done with this chunk. |
| break; |
| } |
| } |
| |
| return iree_ok_status(); |
| } |
| |
| // Finalizes literal matching. Handles any partial match from the last chunk. |
| static iree_status_t iree_tokenizer_split_literal_finalize( |
| const iree_tokenizer_segmenter_split_t* segmenter, |
| iree_tokenizer_segmenter_split_state_t* state, |
| iree_string_view_t remaining_input, iree_host_size_t chunk_base, |
| iree_tokenizer_split_callback_context_t* context) { |
| const char* pattern = segmenter->literal.pattern.data; |
| iree_host_size_t pattern_length = segmenter->literal.pattern.size; |
| |
| // Resume partial match if any. |
| if (state->literal_match_position > 0) { |
| iree_host_size_t remaining = pattern_length - state->literal_match_position; |
| if (remaining_input.size >= remaining && |
| memcmp(remaining_input.data, pattern + state->literal_match_position, |
| remaining) == 0) { |
| // Full match completed. |
| iree_host_size_t match_start = state->literal_match_start; |
| iree_host_size_t match_end = chunk_base + remaining; |
| iree_tokenizer_regex_match_t match = {match_start, match_end}; |
| iree_status_t status = |
| iree_tokenizer_split_inline_callback(context, match); |
| if (!iree_status_is_ok(status)) { |
| state->literal_match_position = 0; |
| return status; |
| } |
| // Continue scanning after the match. |
| remaining_input.data += remaining; |
| remaining_input.size -= remaining; |
| chunk_base += remaining; |
| } |
| state->literal_match_position = 0; |
| } |
| |
| // Scan remaining input for matches. |
| iree_host_size_t scan_position = 0; |
| while (scan_position < remaining_input.size) { |
| const char* found = memchr(remaining_input.data + scan_position, pattern[0], |
| remaining_input.size - scan_position); |
| if (!found) break; |
| |
| iree_host_size_t match_offset = |
| (iree_host_size_t)(found - remaining_input.data); |
| iree_host_size_t bytes_left = remaining_input.size - match_offset; |
| |
| if (bytes_left >= pattern_length && |
| memcmp(found, pattern, pattern_length) == 0) { |
| iree_host_size_t match_start = chunk_base + match_offset; |
| iree_host_size_t match_end = match_start + pattern_length; |
| iree_tokenizer_regex_match_t match = {match_start, match_end}; |
| iree_status_t status = |
| iree_tokenizer_split_inline_callback(context, match); |
| if (!iree_status_is_ok(status)) { |
| return status; |
| } |
| scan_position = match_offset + pattern_length; |
| } else { |
| scan_position = match_offset + 1; |
| } |
| } |
| |
| return iree_ok_status(); |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // Vtable Implementation |
| //===----------------------------------------------------------------------===// |
| |
| static const iree_tokenizer_segmenter_vtable_t |
| iree_tokenizer_segmenter_split_vtable; |
| |
| iree_status_t iree_tokenizer_segmenter_split_allocate( |
| iree_tokenizer_regex_dfa_t dfa, uint8_t* dfa_data, |
| iree_tokenizer_regex_split_behavior_t behavior, bool invert, |
| iree_allocator_t allocator, iree_tokenizer_segmenter_t** out_segmenter) { |
| IREE_TRACE_ZONE_BEGIN(z0); |
| IREE_ASSERT_ARGUMENT(dfa_data); |
| IREE_ASSERT_ARGUMENT(out_segmenter); |
| *out_segmenter = NULL; |
| |
| iree_tokenizer_segmenter_split_t* segmenter = NULL; |
| IREE_RETURN_AND_END_ZONE_IF_ERROR( |
| z0, |
| iree_allocator_malloc(allocator, sizeof(*segmenter), (void**)&segmenter)); |
| |
| iree_tokenizer_segmenter_initialize( |
| &segmenter->base, &iree_tokenizer_segmenter_split_vtable, |
| sizeof(iree_tokenizer_segmenter_split_state_t)); |
| segmenter->allocator = allocator; |
| segmenter->behavior = behavior; |
| segmenter->invert = invert; |
| segmenter->is_literal = false; |
| segmenter->regex.dfa = dfa; |
| segmenter->regex.dfa_data = dfa_data; |
| |
| // Compute stride acceleration data for the DFA. |
| iree_status_t status = iree_tokenizer_regex_stride_allocate( |
| &segmenter->regex.dfa, allocator, &segmenter->regex.stride); |
| |
| if (iree_status_is_ok(status)) { |
| *out_segmenter = &segmenter->base; |
| } else { |
| iree_allocator_free(allocator, dfa_data); |
| iree_allocator_free(allocator, segmenter); |
| } |
| IREE_TRACE_ZONE_END(z0); |
| return status; |
| } |
| |
| iree_status_t iree_tokenizer_segmenter_split_literal_allocate( |
| iree_string_view_t pattern, iree_tokenizer_regex_split_behavior_t behavior, |
| bool invert, iree_allocator_t allocator, |
| iree_tokenizer_segmenter_t** out_segmenter) { |
| IREE_TRACE_ZONE_BEGIN(z0); |
| IREE_ASSERT_ARGUMENT(out_segmenter); |
| *out_segmenter = NULL; |
| |
| if (pattern.size == 0) { |
| IREE_RETURN_AND_END_ZONE( |
| z0, iree_make_status(IREE_STATUS_INVALID_ARGUMENT, |
| "literal split pattern cannot be empty")); |
| } |
| |
| iree_tokenizer_segmenter_split_t* segmenter = NULL; |
| iree_status_t status = |
| iree_allocator_malloc(allocator, sizeof(*segmenter), (void**)&segmenter); |
| |
| // Allocate and copy pattern data. |
| uint8_t* pattern_data = NULL; |
| if (iree_status_is_ok(status)) { |
| status = |
| iree_allocator_malloc(allocator, pattern.size, (void**)&pattern_data); |
| } |
| |
| if (iree_status_is_ok(status)) { |
| memcpy(pattern_data, pattern.data, pattern.size); |
| iree_tokenizer_segmenter_initialize( |
| &segmenter->base, &iree_tokenizer_segmenter_split_vtable, |
| sizeof(iree_tokenizer_segmenter_split_state_t)); |
| segmenter->allocator = allocator; |
| segmenter->behavior = behavior; |
| segmenter->invert = invert; |
| segmenter->is_literal = true; |
| segmenter->literal.pattern_data = pattern_data; |
| segmenter->literal.pattern = |
| iree_make_string_view((const char*)pattern_data, pattern.size); |
| *out_segmenter = &segmenter->base; |
| } else { |
| iree_allocator_free(allocator, pattern_data); |
| iree_allocator_free(allocator, segmenter); |
| } |
| IREE_TRACE_ZONE_END(z0); |
| return status; |
| } |
| |
| static void iree_tokenizer_segmenter_split_destroy( |
| iree_tokenizer_segmenter_t* segmenter) { |
| IREE_TRACE_ZONE_BEGIN(z0); |
| iree_tokenizer_segmenter_split_t* self = |
| (iree_tokenizer_segmenter_split_t*)segmenter; |
| iree_allocator_t allocator = self->allocator; |
| if (self->is_literal) { |
| if (self->literal.pattern_data) { |
| iree_allocator_free(allocator, self->literal.pattern_data); |
| } |
| } else { |
| iree_tokenizer_regex_stride_free(self->regex.stride, allocator); |
| if (self->regex.dfa_data) { |
| iree_allocator_free(allocator, self->regex.dfa_data); |
| } |
| } |
| iree_allocator_free(allocator, self); |
| IREE_TRACE_ZONE_END(z0); |
| } |
| |
| static iree_status_t iree_tokenizer_segmenter_split_state_initialize( |
| const iree_tokenizer_segmenter_t* segmenter, void* storage, |
| iree_tokenizer_segmenter_state_t** out_state) { |
| IREE_TRACE_ZONE_BEGIN(z0); |
| const iree_tokenizer_segmenter_split_t* self = |
| (const iree_tokenizer_segmenter_split_t*)segmenter; |
| iree_tokenizer_segmenter_split_state_t* state = |
| (iree_tokenizer_segmenter_split_state_t*)storage; |
| |
| memset(state, 0, sizeof(*state)); |
| state->base.segmenter = segmenter; |
| state->behavior = self->behavior; |
| state->invert = self->invert; |
| state->is_literal = self->is_literal; |
| if (!self->is_literal) { |
| iree_tokenizer_regex_exec_initialize(&state->regex_state, &self->regex.dfa); |
| } |
| |
| *out_state = &state->base; |
| IREE_TRACE_ZONE_END(z0); |
| return iree_ok_status(); |
| } |
| |
| static void iree_tokenizer_segmenter_split_state_deinitialize( |
| iree_tokenizer_segmenter_state_t* state) { |
| IREE_TRACE_ZONE_BEGIN(z0); |
| (void)state; |
| IREE_TRACE_ZONE_END(z0); |
| } |
| |
// Processes |input|, emitting up to |output.capacity| segments. On return
// |out_consumed| may be less than input.size when the output filled mid-chunk
// or when a pending regex match must be deferred to finalize(); the caller is
// expected to re-feed the unconsumed bytes.
static iree_status_t iree_tokenizer_segmenter_split_state_process(
    iree_tokenizer_segmenter_state_t* state, iree_string_view_t input,
    iree_tokenizer_segment_output_t output, iree_host_size_t* out_consumed,
    iree_host_size_t* out_segment_count) {
  iree_tokenizer_segmenter_split_state_t* self =
      (iree_tokenizer_segmenter_split_state_t*)state;
  const iree_tokenizer_segmenter_split_t* segmenter =
      (const iree_tokenizer_segmenter_split_t*)state->segmenter;

  *out_consumed = 0;
  *out_segment_count = 0;

  // Nothing to do with no input or no room for output.
  if (input.size == 0 || output.capacity == 0) {
    return iree_ok_status();
  }

  iree_host_size_t chunk_base = self->bytes_processed;

  // Set up emitter and callback context. The context holds a transient copy
  // of mutable state that the inline callback updates per-match.
  iree_tokenizer_split_emitter_t emitter = {
      .output = output,
      .chunk_base = chunk_base,
      .count = 0,
      .last_end = self->last_emit_end,
      .full = false,
  };
  iree_tokenizer_split_callback_context_t context = {
      .emitter = &emitter,
      .last_emit_end = self->last_emit_end,
      .pending_start = self->pending_start,
      .pending_end = self->pending_end,
      .has_pending = self->has_pending,
      .behavior = self->behavior,
      .invert = self->invert,
      .absolute = false,
  };

  // Feed pattern matcher; matches are processed inline via callback.
  iree_status_t feed_status;
  if (self->is_literal) {
    feed_status = iree_tokenizer_split_literal_process(segmenter, self, input,
                                                       chunk_base, &context);
  } else {
    feed_status = iree_tokenizer_regex_exec_feed(
        &segmenter->regex.dfa, &self->regex_state, input, chunk_base,
        segmenter->regex.stride, iree_tokenizer_split_inline_callback,
        &context);
  }
  // RESOURCE_EXHAUSTED means output capacity ran out mid-scan; it is handled
  // below as the emitter-full case rather than propagated as an error.
  if (!iree_status_is_ok(feed_status) &&
      !iree_status_is_resource_exhausted(feed_status)) {
    return feed_status;
  }
  if (iree_status_is_resource_exhausted(feed_status)) {
    emitter.full = true;
  }
  iree_status_ignore(feed_status);

  // Determine consumption and commit state based on output capacity.
  if (emitter.full && emitter.last_end > chunk_base) {
    // Output filled mid-chunk: consume up to last emitted segment.
    // Discard any pending state accumulated after the last emit — those bytes
    // are beyond our consumption point and will be re-fed on the next call.
    *out_consumed = emitter.last_end - chunk_base;
    self->bytes_processed = emitter.last_end;
    self->last_emit_end = emitter.last_end;
    self->has_pending = false;
    self->literal_match_position = 0;
    if (!self->is_literal) {
      iree_tokenizer_regex_exec_initialize(&self->regex_state,
                                           &segmenter->regex.dfa);
    }
  } else if (emitter.full) {
    // Output filled but no progress (capacity too small for first result).
    // Leave persistent state unchanged — nothing was consumed or emitted.
    *out_consumed = 0;
    self->literal_match_position = 0;
    if (!self->is_literal) {
      iree_tokenizer_regex_exec_initialize(&self->regex_state,
                                           &segmenter->regex.dfa);
    }
  } else {
    // Normal case: consume all input, commit full callback state.
    self->bytes_processed = chunk_base + input.size;
    self->last_emit_end = context.last_emit_end;
    self->has_pending = context.has_pending;
    self->pending_start = context.pending_start;
    self->pending_end = context.pending_end;

    // Check if the regex has a pending match that finalize() will need to emit.
    // A pending regex match exists when in_match is true AND there's an
    // accepting position (either lookahead-passed or fallback).
    //
    // CRITICAL: When there's a pending regex match, we must NOT consume past
    // last_emit_end. The regex tracks match positions cumulatively from init,
    // but finalize() uses chunk_base=bytes_processed for position arithmetic.
    // If we consume past the pending match's start position, finalize() will
    // compute start - chunk_base = underflow.
    //
    // Example: Input " if" matches [0,3) and [3,6). The [3,6) match is
    // pending after process() (greedy match waits for confirming byte).
    // If we set bytes_processed=6, finalize() gets chunk_base=6 and computes
    // 3 - 6 = underflow. By capping at last_emit_end=3, finalize() gets
    // chunk_base=3 and re-scans " if", producing the correct [0,3) relative
    // position (absolute [3,6)).
    bool has_pending_regex_match =
        !self->is_literal && self->regex_state.in_match &&
        (self->regex_state.has_accept || self->regex_state.has_accept_fallback);

    // Check if we should use wraparound arithmetic for this behavior.
    // ISOLATED+invert needs wraparound because trailing gaps must be emitted
    // and has_pending relies on last_emit_end < bytes_processed.
    bool uses_wraparound =
        self->invert &&
        self->behavior == IREE_TOKENIZER_UTIL_REGEX_SPLIT_ISOLATED;

    // Determine if we need to cap consumed bytes:
    // 1. Pending regex match: ALWAYS cap to avoid position underflow
    // 2. Trailing data: cap when not handled by pending segment logic
    bool has_trailing_data = self->last_emit_end < self->bytes_processed;
    bool should_cap_trailing =
        !self->has_pending && !uses_wraparound && has_trailing_data;

    if (has_pending_regex_match || should_cap_trailing) {
      // Cap consumed at last emitted segment. The unconsumed bytes will be
      // passed to finalize(), which can re-scan them with correct chunk_base.
      *out_consumed = self->last_emit_end - chunk_base;
      self->bytes_processed = self->last_emit_end;
      self->literal_match_position = 0;
      if (!self->is_literal) {
        iree_tokenizer_regex_exec_initialize(&self->regex_state,
                                             &segmenter->regex.dfa);
      }
      // Set deferred flag only when capping specifically for the pending regex
      // match case that previously used wraparound arithmetic. This preserves
      // has_pending()=true for ISOLATED+invert mode, which relies on it.
      // For other cases (should_cap_trailing), the original behavior was
      // has_pending()=false after capping.
      self->deferred_to_finalize = has_pending_regex_match && uses_wraparound;
    } else {
      *out_consumed = input.size;
      self->deferred_to_finalize = false;
    }
  }

  *out_segment_count = emitter.count;
  return iree_ok_status();
}
| |
// Finalizes segmentation at end-of-input: phase 1 feeds |remaining_input|
// through the pattern matcher (literal or regex), phase 2 flushes the
// buffered pending segment and the trailing gap. Re-entrant: if the output
// fills (emitter.full) partial progress is committed per-emit so a repeat
// call never re-emits a segment; callers use has_pending() to learn that
// re-entry is needed. Returns OK both on completion and on output-full.
// |out_segment_count| receives the number of segments written to |output|.
static iree_status_t iree_tokenizer_segmenter_split_state_finalize(
    iree_tokenizer_segmenter_state_t* state, iree_string_view_t remaining_input,
    iree_tokenizer_segment_output_t output,
    iree_host_size_t* out_segment_count) {
  iree_tokenizer_segmenter_split_state_t* self =
      (iree_tokenizer_segmenter_split_state_t*)state;
  const iree_tokenizer_segmenter_split_t* segmenter =
      (const iree_tokenizer_segmenter_split_t*)state->segmenter;

  *out_segment_count = 0;

  // All positions below are absolute byte offsets; remaining_input starts at
  // chunk_base (= everything consumed by prior process() calls).
  iree_host_size_t chunk_base = self->bytes_processed;

  // Set up emitter and callback context (same pattern as process).
  iree_tokenizer_split_emitter_t emitter = {
      .output = output,
      .chunk_base = chunk_base,
      .count = 0,
      .last_end = self->last_emit_end,
      .full = false,
  };
  iree_tokenizer_split_callback_context_t context = {
      .emitter = &emitter,
      .last_emit_end = self->last_emit_end,
      .pending_start = self->pending_start,
      .pending_end = self->pending_end,
      .has_pending = self->has_pending,
      .behavior = self->behavior,
      .invert = self->invert,
      .absolute = false,
  };

  iree_host_size_t input_end = chunk_base + remaining_input.size;

  // Phase 1: Feed remaining input and finalize pattern matcher. This phase
  // runs once; on re-entrant calls (after trailing flush overflow), it is
  // skipped.
  if (!self->finalize_feed_done) {
    if (self->is_literal) {
      // Literal mode: finalize handles partial matches and remaining input.
      iree_status_t finalize_status = iree_tokenizer_split_literal_finalize(
          segmenter, self, remaining_input, chunk_base, &context);
      if (iree_status_is_resource_exhausted(finalize_status)) {
        // Output full - translate to emitter.full flag.
        iree_status_ignore(finalize_status);
        emitter.full = true;
      } else if (!iree_status_is_ok(finalize_status)) {
        return finalize_status;
      }
    } else {
      // Regex mode: feed remaining input through regex.
      if (remaining_input.size > 0) {
        iree_status_t feed_status = iree_tokenizer_regex_exec_feed(
            &segmenter->regex.dfa, &self->regex_state, remaining_input,
            chunk_base, segmenter->regex.stride,
            iree_tokenizer_split_inline_callback, &context);
        if (iree_status_is_resource_exhausted(feed_status)) {
          // Output full during feed - preserve regex state for re-entry.
          iree_status_ignore(feed_status);
          emitter.full = true;
        } else if (!iree_status_is_ok(feed_status)) {
          return feed_status;
        }
      }

      // Finalize regex (may produce one final match at end-of-input).
      // Skip if output already full.
      if (!emitter.full) {
        iree_status_t finalize_status = iree_tokenizer_regex_exec_finalize(
            &segmenter->regex.dfa, &self->regex_state, input_end,
            iree_tokenizer_split_inline_callback, &context);
        if (iree_status_is_ok(finalize_status)) {
          // Matcher fully drained: reset it so the state can be reused.
          iree_tokenizer_regex_exec_initialize(&self->regex_state,
                                               &segmenter->regex.dfa);
        } else if (iree_status_is_resource_exhausted(finalize_status)) {
          // Final match couldn't be emitted - preserve regex state for
          // re-entry.
          iree_status_ignore(finalize_status);
          emitter.full = true;
        } else {
          return finalize_status;
        }
      }
    }

    // Commit progress. When full, commit partial progress so re-entry doesn't
    // re-emit. When complete, commit full progress and mark phase done.
    if (emitter.full) {
      self->last_emit_end = emitter.last_end;
    } else {
      self->last_emit_end = context.last_emit_end;
      self->has_pending = context.has_pending;
      self->pending_start = context.pending_start;
      self->pending_end = context.pending_end;
      self->finalize_feed_done = true;
      self->deferred_to_finalize = false;  // Deferred regex work is done.
    }
  }

  // Phase 2: Flush pending and trailing gap. Each emit may overflow, so state
  // updates are guarded per-emit. On re-entry, self's state reflects what was
  // already emitted (has_pending=false after pending emitted, last_emit_end
  // advanced after each segment).
  //
  // Trailing gap emission depends on mode:
  // - Normal mode: always emit trailing gap (it's content)
  // - Invert mode + ISOLATED: emit trailing gap (ISOLATED emits everything)
  // - Invert mode + others: skip trailing gap (it's a delimiter)
  bool emit_trailing_gap =
      !self->invert ||
      self->behavior == IREE_TOKENIZER_UTIL_REGEX_SPLIT_ISOLATED;
  iree_host_size_t trailing_end = input_end;
  if (!emitter.full && self->has_pending) {
    if (self->behavior == IREE_TOKENIZER_UTIL_REGEX_SPLIT_MERGED_WITH_NEXT) {
      // MERGED_WITH_NEXT: merge pending delimiter with trailing content.
      // In invert mode, pending is a delimiter (gap), trailing is content.
      // Only merge if there's trailing content OR emit_trailing_gap is true.
      if (emit_trailing_gap || trailing_end > self->pending_end) {
        iree_tokenizer_split_emit(&emitter, self->pending_start, trailing_end,
                                  false);
      } else {
        // Invert mode: pending is delimiter with no content to merge, discard.
        iree_tokenizer_split_emit(&emitter, self->pending_start,
                                  self->pending_end, false);
      }
      if (!emitter.full) {
        self->has_pending = false;
        self->last_emit_end = trailing_end;
      }
    } else {
      // Other behaviors: pending segment is emitted as-is, then the trailing
      // gap (if the mode allows) as a separate segment.
      iree_tokenizer_split_emit(&emitter, self->pending_start,
                                self->pending_end, false);
      if (!emitter.full) {
        self->has_pending = false;
        self->last_emit_end = self->pending_end;
        if (emit_trailing_gap) {
          iree_tokenizer_split_emit(&emitter, self->last_emit_end, trailing_end,
                                    false);
        }
        if (!emitter.full) {
          self->last_emit_end = trailing_end;
        }
      }
    }
  } else if (!emitter.full) {
    // No pending segment: only the trailing gap remains.
    // NOTE(review): when last_emit_end == trailing_end this passes a
    // zero-length span to emit — presumably iree_tokenizer_split_emit
    // filters empty segments; confirm against its implementation.
    if (emit_trailing_gap) {
      iree_tokenizer_split_emit(&emitter, self->last_emit_end, trailing_end,
                                false);
    }
    if (!emitter.full) {
      self->last_emit_end = trailing_end;
    }
  }

  *out_segment_count = emitter.count;
  if (emitter.full) {
    // Commit partial progress so re-entry doesn't re-emit. Caller should check
    // has_pending() to know if re-entry is needed.
    self->last_emit_end = emitter.last_end;
    return iree_ok_status();
  }
  // All phases complete. Advance bytes_processed and reset for potential reuse.
  self->bytes_processed = input_end;
  self->finalize_feed_done = false;
  return iree_ok_status();
}
| |
| static bool iree_tokenizer_segmenter_split_state_has_pending( |
| const iree_tokenizer_segmenter_state_t* state) { |
| const iree_tokenizer_segmenter_split_state_t* self = |
| (const iree_tokenizer_segmenter_split_state_t*)state; |
| |
| // Check if there's a buffered pending segment. |
| if (self->has_pending) return true; |
| |
| // Check if process() capped consumed due to a pending regex match. |
| // The regex state was reset, but finalize() will re-scan the unconsumed |
| // bytes to emit the deferred match. |
| if (self->deferred_to_finalize) return true; |
| |
| // Check if the regex has a partial match that finalize() will complete. |
| // This is critical for patterns like \w+ where the match extends to end of |
| // input but the regex stays in "in_match" state waiting for more input. |
| if (!self->is_literal && self->regex_state.in_match && |
| (self->regex_state.has_accept || self->regex_state.has_accept_fallback)) { |
| return true; |
| } |
| |
| // Check if finalize() will emit trailing data. |
| // In normal mode, trailing gaps (content) are emitted. |
| // In invert mode, trailing gaps (delimiters) are only emitted for ISOLATED. |
| if (self->last_emit_end < self->bytes_processed) { |
| bool emit_trailing = |
| !self->invert || |
| self->behavior == IREE_TOKENIZER_UTIL_REGEX_SPLIT_ISOLATED; |
| return emit_trailing; |
| } |
| |
| return false; |
| } |
| |
// Flushes all scanned-but-unemitted state so the caller can compact its
// input buffer: first emits any accepted (but not yet confirmed) regex
// match, then the pending segment and trailing gap up to bytes_processed.
// On full success the pattern matcher and position tracking are reset to 0
// (the tokenizer refills from offset 0 after a flush). If the output fills,
// partial progress is committed and state preserved so the caller can retry
// with more capacity. |out_bytes_committed| receives the absolute offset up
// to which the caller may discard buffered input.
static iree_status_t iree_tokenizer_segmenter_split_state_flush(
    iree_tokenizer_segmenter_state_t* state,
    iree_tokenizer_segment_output_t output, iree_host_size_t* out_segment_count,
    iree_host_size_t* out_bytes_committed) {
  iree_tokenizer_segmenter_split_state_t* self =
      (iree_tokenizer_segmenter_split_state_t*)state;
  const iree_tokenizer_segmenter_split_t* segmenter =
      (const iree_tokenizer_segmenter_split_t*)state->segmenter;
  *out_segment_count = 0;
  *out_bytes_committed = self->last_emit_end;

  // Zero capacity: nothing can be emitted; report what is already committed.
  if (output.capacity == 0) {
    return iree_ok_status();
  }

  iree_tokenizer_split_emitter_t emitter = {
      .output = output,
      .chunk_base = 0,  // Segments are absolute buffer positions.
      .count = 0,
      .last_end = self->last_emit_end,
      .full = false,
  };

  // If we have an accepted match, emit it first.
  if (self->regex_state.in_match &&
      (self->regex_state.has_accept || self->regex_state.has_accept_fallback)) {
    // Prefer the lookahead-confirmed accept position over the fallback.
    iree_host_size_t match_end = self->regex_state.has_accept
                                     ? self->regex_state.last_accept
                                     : self->regex_state.last_accept_fallback;

    // Gap is between last emit and match start, match is the regex match.
    // Invert flag is passed to handler to swap semantics internally.
    iree_tokenizer_split_match_result_t result =
        iree_tokenizer_split_handle_match(
            self->behavior, self->last_emit_end, self->regex_state.match_start,
            self->regex_state.match_start, match_end, self->pending_start,
            self->pending_end, self->has_pending, self->invert);

    // emit_result returns false when the output filled mid-result; commit
    // only what the emitter actually wrote and let the caller retry.
    if (!iree_tokenizer_split_emit_result(&emitter, &result, true)) {
      *out_segment_count = emitter.count;
      *out_bytes_committed = emitter.last_end;
      return iree_ok_status();
    }

    self->last_emit_end = result.last_emit_end;
    self->has_pending = result.has_pending;
    self->pending_start = result.pending_start;
    self->pending_end = result.pending_end;
  }

  // Emit trailing gap up to bytes_processed, like finalize does.
  // This ensures we commit all scanned bytes so the buffer can compact.
  // State updates are guarded per-emit for correctness of out_bytes_committed.
  //
  // Trailing gap emission depends on mode (same logic as finalize).
  bool emit_trailing_gap =
      !self->invert ||
      self->behavior == IREE_TOKENIZER_UTIL_REGEX_SPLIT_ISOLATED;
  iree_host_size_t trailing_end = self->bytes_processed;
  // NOTE(review): unlike finalize(), the pending segment here is only flushed
  // when trailing_end > last_emit_end. This assumes has_pending implies
  // last_emit_end < pending_end <= bytes_processed (so the guard is always
  // taken when pending exists) — confirm that invariant holds.
  if (!emitter.full && trailing_end > self->last_emit_end) {
    if (self->has_pending) {
      if (self->behavior == IREE_TOKENIZER_UTIL_REGEX_SPLIT_MERGED_WITH_NEXT) {
        // MERGED_WITH_NEXT: merge pending with trailing content (or discard
        // a content-less delimiter in invert mode) — mirrors finalize().
        if (emit_trailing_gap || trailing_end > self->pending_end) {
          iree_tokenizer_split_emit(&emitter, self->pending_start, trailing_end,
                                    false);
        } else {
          iree_tokenizer_split_emit(&emitter, self->pending_start,
                                    self->pending_end, false);
        }
        if (!emitter.full) {
          self->has_pending = false;
          self->last_emit_end = trailing_end;
        }
      } else {
        // Other behaviors: pending as-is, then trailing gap if allowed.
        iree_tokenizer_split_emit(&emitter, self->pending_start,
                                  self->pending_end, false);
        if (!emitter.full) {
          self->has_pending = false;
          self->last_emit_end = self->pending_end;
          if (emit_trailing_gap) {
            iree_tokenizer_split_emit(&emitter, self->last_emit_end,
                                      trailing_end, false);
          }
          if (!emitter.full) {
            self->last_emit_end = trailing_end;
          }
        }
      }
    } else {
      if (emit_trailing_gap) {
        iree_tokenizer_split_emit(&emitter, self->last_emit_end, trailing_end,
                                  false);
      }
      if (!emitter.full) {
        self->last_emit_end = trailing_end;
      }
    }
  }

  // Check if output is full BEFORE resetting state. This matches finalize()'s
  // pattern: only reset state when all emissions succeeded. If full, return
  // partial results and preserve state so caller can retry with more capacity.
  *out_segment_count = emitter.count;
  *out_bytes_committed = self->last_emit_end;
  if (emitter.full) {
    return iree_ok_status();
  }

  // All emissions succeeded. Reset pattern matcher state for fresh input.
  if (self->is_literal) {
    self->literal_match_position = 0;
  } else {
    iree_tokenizer_regex_exec_initialize(&self->regex_state,
                                         &segmenter->regex.dfa);
  }

  // Reset position tracking to 0. After flush, the tokenizer compacts the
  // buffer and refills from position 0, so our next process() call will
  // receive a fresh chunk at offset 0.
  self->bytes_processed = 0;
  self->last_emit_end = 0;

  return iree_ok_status();
}
| |
// Vtable wiring the split segmenter into the generic segmenter interface.
static const iree_tokenizer_segmenter_vtable_t
    iree_tokenizer_segmenter_split_vtable = {
        .destroy = iree_tokenizer_segmenter_split_destroy,
        .state_initialize = iree_tokenizer_segmenter_split_state_initialize,
        .state_deinitialize = iree_tokenizer_segmenter_split_state_deinitialize,
        .state_process = iree_tokenizer_segmenter_split_state_process,
        .state_finalize = iree_tokenizer_segmenter_split_state_finalize,
        .state_has_pending = iree_tokenizer_segmenter_split_state_has_pending,
        .state_flush = iree_tokenizer_segmenter_split_state_flush,
};