blob: 77d3ddad8546c12b40b409e6bb50e44422541167 [file]
// Copyright 2026 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#include "iree/tokenizer/segmenter/split.h"
#include <string.h>
//===----------------------------------------------------------------------===//
// Types
//===----------------------------------------------------------------------===//
// Split segmenter: configuration read by every state created from it.
// Operates in one of two modes selected by |is_literal|; only the union member
// matching the mode is valid.
typedef struct iree_tokenizer_segmenter_split_t {
iree_tokenizer_segmenter_t base;
// Allocator used for this struct and for the owned buffers below.
iree_allocator_t allocator;
// Selects which behavior handler turns (gap, match) pairs into segments.
iree_tokenizer_regex_split_behavior_t behavior;
// Passed through to the behavior handlers, which swap the roles of match and
// gap where that is meaningful.
bool invert;
// false = regex mode, true = literal mode.
bool is_literal;
union {
struct {
// DFA executed against the input in regex mode.
iree_tokenizer_regex_dfa_t dfa;
// Stride acceleration data computed from |dfa| at allocation time.
iree_tokenizer_regex_stride_t* stride;
// Buffer handed to allocate alongside |dfa| (presumably its backing
// storage - confirm against the header). Freed on destroy.
uint8_t* dfa_data;
} regex;
struct {
iree_string_view_t pattern; // Points into pattern_data
// Owned copy of pattern bytes.
uint8_t* pattern_data;
} literal;
};
} iree_tokenizer_segmenter_split_t;
// Per-stream state for the split segmenter. Zero-initialized by
// state_initialize, which then caches the segmenter's configuration.
typedef struct iree_tokenizer_segmenter_split_state_t {
iree_tokenizer_segmenter_state_t base;
// Regex executor state. Only initialized and used in regex mode.
iree_tokenizer_regex_exec_state_t regex_state;
// Position tracking (all absolute byte offsets).
// Total bytes consumed so far; used as the base offset of the next chunk.
iree_host_size_t bytes_processed;
// Offset where the next gap starts (end of the last committed match/emit).
iree_host_size_t last_emit_end;
// Pending segment for MERGED_WITH_NEXT and CONTIGUOUS behaviors.
iree_host_size_t pending_start;
iree_host_size_t pending_end;
// Cached from segmenter for hot path.
iree_tokenizer_regex_split_behavior_t behavior;
bool invert;
bool is_literal;
// True when [pending_start, pending_end) holds a buffered range.
bool has_pending;
// Set when process() caps consumed due to a pending regex match. The regex
// state was reset to avoid position underflow, but finalize() will re-scan
// the unconsumed bytes to emit the deferred match. has_pending() checks this
// flag to correctly indicate that finalize() is needed.
bool deferred_to_finalize;
// Set after finalize's feed+regex phases complete. Prevents re-running them
// on re-entrant finalize calls (where only trailing flush needs to resume).
bool finalize_feed_done;
// Literal mode: tracks partial match position when pattern spans chunks.
// Value is number of pattern bytes matched so far (0 = no partial match).
iree_host_size_t literal_match_position;
// Absolute position where the partial match started.
iree_host_size_t literal_match_start;
} iree_tokenizer_segmenter_split_state_t;
// Result of processing one match. Handlers are pure functions that compute
// this result; the main loop commits state changes only if all emits succeed.
// Layout optimized: size_t fields first, then small fields packed at end.
typedef struct {
// Segments to emit, in order. Only the first |segment_count| pairs are valid.
iree_host_size_t segments[4]; // [start0, end0, start1, end1]
// Range to buffer for a later match; meaningful only when |has_pending|.
iree_host_size_t pending_start;
iree_host_size_t pending_end;
// New gap start to commit; every handler sets this to the match end.
iree_host_size_t last_emit_end;
// 0, 1, or 2 segments.
uint8_t segment_count;
// Whether [pending_start, pending_end) should replace the pending range.
bool has_pending;
} iree_tokenizer_split_match_result_t;
//===----------------------------------------------------------------------===//
// Behavior Handlers (pure functions, no side effects)
//===----------------------------------------------------------------------===//
// REMOVED: exactly one side of each (gap, match) pair survives.
//   invert == false: the pattern describes delimiters, so the gap is kept and
//                    the match is dropped.
//   invert == true:  the pattern describes tokens, so the match is kept and
//                    the gap is dropped.
// An empty kept range yields no segment. The pending_* inputs are ignored.
static iree_tokenizer_split_match_result_t iree_tokenizer_split_handle_removed(
    iree_host_size_t gap_start, iree_host_size_t gap_end,
    iree_host_size_t match_start, iree_host_size_t match_end,
    iree_host_size_t pending_start, iree_host_size_t pending_end,
    bool has_pending, bool invert) {
  (void)pending_start;
  (void)pending_end;
  (void)has_pending;

  // Select the surviving range up front so there is a single emit path.
  const iree_host_size_t kept_start = invert ? match_start : gap_start;
  const iree_host_size_t kept_end = invert ? match_end : gap_end;

  iree_tokenizer_split_match_result_t result = {0};
  result.last_emit_end = match_end;
  if (kept_end > kept_start) {
    result.segments[0] = kept_start;
    result.segments[1] = kept_end;
    result.segment_count = 1;
  }
  return result;
}
// ISOLATED: the gap and the match each become their own segment, in
// positional order (gap first, then match). An empty gap is omitted so the
// match moves into the first slot. |invert| has no effect and the pending_*
// inputs are ignored.
static iree_tokenizer_split_match_result_t iree_tokenizer_split_handle_isolated(
    iree_host_size_t gap_start, iree_host_size_t gap_end,
    iree_host_size_t match_start, iree_host_size_t match_end,
    iree_host_size_t pending_start, iree_host_size_t pending_end,
    bool has_pending, bool invert) {
  (void)pending_start;
  (void)pending_end;
  (void)has_pending;
  (void)invert;

  iree_tokenizer_split_match_result_t result = {0};
  uint8_t count = 0;
  if (gap_end > gap_start) {
    result.segments[0] = gap_start;
    result.segments[1] = gap_end;
    count = 1;
  }
  // The match always occupies the slot right after the (optional) gap.
  result.segments[count * 2] = match_start;
  result.segments[count * 2 + 1] = match_end;
  result.segment_count = (uint8_t)(count + 1);
  result.last_emit_end = match_end;
  return result;
}
// MERGED_WITH_PREVIOUS: the gap and the match that follows it are fused into
// the single segment [gap_start, match_end). The result is the same whether
// or not |invert| is set, and the pending_* inputs are ignored.
static iree_tokenizer_split_match_result_t
iree_tokenizer_split_handle_merged_with_previous(iree_host_size_t gap_start,
                                                 iree_host_size_t match_end,
                                                 iree_host_size_t pending_start,
                                                 iree_host_size_t pending_end,
                                                 bool has_pending,
                                                 bool invert) {
  (void)pending_start;
  (void)pending_end;
  (void)has_pending;
  (void)invert;
  // Fields not named here (unused segment slots, pending range) stay zero.
  iree_tokenizer_split_match_result_t result = {
      .segments = {gap_start, match_end},
      .last_emit_end = match_end,
      .segment_count = 1,
  };
  return result;
}
// MERGED_WITH_NEXT: the delimiter is buffered so it can be prepended to the
// content that follows it.
//   invert == false: content is the gap, delimiter is the match. The match is
//                    always buffered.
//   invert == true:  content is the match, delimiter is the gap. The gap is
//                    buffered only when it is non-empty.
// The emitted segment starts at the previously buffered range when there is
// one, otherwise at the start of the content.
static iree_tokenizer_split_match_result_t
iree_tokenizer_split_handle_merged_with_next(iree_host_size_t gap_start,
                                             iree_host_size_t gap_end,
                                             iree_host_size_t match_start,
                                             iree_host_size_t match_end,
                                             iree_host_size_t pending_start,
                                             bool has_pending, bool invert) {
  // Resolve the content/delimiter roles once; the rest is mode-independent.
  const iree_host_size_t content_start = invert ? match_start : gap_start;
  const iree_host_size_t content_end = invert ? match_end : gap_end;
  const iree_host_size_t delimiter_start = invert ? gap_start : match_start;
  const iree_host_size_t delimiter_end = invert ? gap_end : match_end;

  iree_tokenizer_split_match_result_t result = {0};
  const iree_host_size_t segment_start =
      has_pending ? pending_start : content_start;
  if (content_end > segment_start) {
    result.segments[0] = segment_start;
    result.segments[1] = content_end;
    result.segment_count = 1;
  }
  result.pending_start = delimiter_start;
  result.pending_end = delimiter_end;
  result.has_pending = !invert || (gap_end > gap_start);
  result.last_emit_end = match_end;
  return result;
}
// CONTIGUOUS: runs of adjacent delimiters are merged into one buffered range
// and content is emitted between them.
//   invert == false: matches are delimiters, gaps are content.
//   invert == true:  gaps are delimiters, matches are content.
// When the new delimiter directly continues the buffered one, the buffered
// range is extended and nothing is emitted. Otherwise the buffered range (if
// any) is emitted, followed by the non-empty content, and the new delimiter
// becomes the buffered range.
static iree_tokenizer_split_match_result_t
iree_tokenizer_split_handle_contiguous(iree_host_size_t gap_start,
                                       iree_host_size_t gap_end,
                                       iree_host_size_t match_start,
                                       iree_host_size_t match_end,
                                       iree_host_size_t pending_start,
                                       iree_host_size_t pending_end,
                                       bool has_pending, bool invert) {
  iree_tokenizer_split_match_result_t result = {0};
  result.last_emit_end = match_end;
  const bool gap_is_nonempty = gap_end > gap_start;

  if (invert) {
    // Gaps are the delimiters here.
    if (has_pending && pending_end == gap_start && gap_is_nonempty) {
      // The gap continues the buffered delimiter run: extend it.
      result.pending_start = pending_start;
      result.pending_end = gap_end;
      result.has_pending = true;
      return result;
    }
    uint8_t count = 0;
    if (has_pending) {
      result.segments[0] = pending_start;
      result.segments[1] = pending_end;
      count = 1;
    }
    if (match_end > match_start) {
      result.segments[count * 2] = match_start;
      result.segments[count * 2 + 1] = match_end;
      ++count;
    }
    result.segment_count = count;
    // Only a non-empty gap is buffered; otherwise nothing stays pending.
    if (gap_is_nonempty) {
      result.pending_start = gap_start;
      result.pending_end = gap_end;
      result.has_pending = true;
    }
    return result;
  }

  // Matches are the delimiters here; one is always buffered on return.
  result.has_pending = true;
  if (has_pending && pending_end == match_start) {
    // The match continues the buffered delimiter run: extend it.
    result.pending_start = pending_start;
    result.pending_end = match_end;
    return result;
  }
  uint8_t count = 0;
  if (has_pending) {
    result.segments[0] = pending_start;
    result.segments[1] = pending_end;
    count = 1;
  }
  if (gap_is_nonempty) {
    result.segments[count * 2] = gap_start;
    result.segments[count * 2 + 1] = gap_end;
    ++count;
  }
  result.segment_count = count;
  result.pending_start = match_start;
  result.pending_end = match_end;
  return result;
}
//===----------------------------------------------------------------------===//
// Split Processing
//===----------------------------------------------------------------------===//
// Routes one (gap, match) pair to the handler for |behavior| and returns that
// handler's result. An unrecognized behavior value trips an assert and yields
// an all-zero result (no segments, nothing pending).
IREE_ATTRIBUTE_ALWAYS_INLINE static inline iree_tokenizer_split_match_result_t
iree_tokenizer_split_handle_match(
    iree_tokenizer_regex_split_behavior_t behavior, iree_host_size_t gap_start,
    iree_host_size_t gap_end, iree_host_size_t match_start,
    iree_host_size_t match_end, iree_host_size_t pending_start,
    iree_host_size_t pending_end, bool has_pending, bool invert) {
  iree_tokenizer_split_match_result_t result = {0};
  switch (behavior) {
    case IREE_TOKENIZER_UTIL_REGEX_SPLIT_REMOVED:
      result = iree_tokenizer_split_handle_removed(
          gap_start, gap_end, match_start, match_end, pending_start,
          pending_end, has_pending, invert);
      break;
    case IREE_TOKENIZER_UTIL_REGEX_SPLIT_ISOLATED:
      result = iree_tokenizer_split_handle_isolated(
          gap_start, gap_end, match_start, match_end, pending_start,
          pending_end, has_pending, invert);
      break;
    case IREE_TOKENIZER_UTIL_REGEX_SPLIT_MERGED_WITH_PREVIOUS:
      result = iree_tokenizer_split_handle_merged_with_previous(
          gap_start, match_end, pending_start, pending_end, has_pending,
          invert);
      break;
    case IREE_TOKENIZER_UTIL_REGEX_SPLIT_MERGED_WITH_NEXT:
      result = iree_tokenizer_split_handle_merged_with_next(
          gap_start, gap_end, match_start, match_end, pending_start,
          has_pending, invert);
      break;
    case IREE_TOKENIZER_UTIL_REGEX_SPLIT_CONTIGUOUS:
      result = iree_tokenizer_split_handle_contiguous(
          gap_start, gap_end, match_start, match_end, pending_start,
          pending_end, has_pending, invert);
      break;
    default:
      IREE_ASSERT(false && "invalid split behavior enum value");
      break;
  }
  return result;
}
// Writes segments into a caller-provided output buffer and records how far
// emission got, so callers can tell what was actually written on overflow.
typedef struct {
// Destination buffer; |output.capacity| bounds the number of segments.
iree_tokenizer_segment_output_t output;
// Subtracted from emitted offsets unless the emit is flagged absolute.
iree_host_size_t chunk_base;
// Number of segments written to |output| so far.
iree_host_size_t count;
// End of last successfully emitted segment.
iree_host_size_t last_end;
// Set once an emit did not fit; all later emits become no-ops.
bool full;
} iree_tokenizer_split_emitter_t;
// Appends the segment [start, end) to the emitter's output.
// Nothing happens if the emitter is already full or the range is empty
// (start >= end). If the output has no free slot, the emitter is marked full
// and the segment is dropped; callers check |full| after a batch of emits.
// Offsets are stored relative to |chunk_base| unless |absolute| is true.
static inline void iree_tokenizer_split_emit(
    iree_tokenizer_split_emitter_t* emitter, iree_host_size_t start,
    iree_host_size_t end, bool absolute) {
  if (emitter->full) return;
  if (end <= start) return;
  if (emitter->count >= emitter->output.capacity) {
    emitter->full = true;
    return;
  }
  const iree_host_size_t rebase = absolute ? 0 : emitter->chunk_base;
  const iree_host_size_t slot = emitter->count;
  emitter->output.values[slot].start = start - rebase;
  emitter->output.values[slot].end = end - rebase;
  emitter->count = slot + 1;
  emitter->last_end = end;
}
// Emits every segment listed in |result|, in order. Returns true when the
// emitter is not full afterwards, i.e. no segment overflowed the output.
static inline bool iree_tokenizer_split_emit_result(
    iree_tokenizer_split_emitter_t* emitter,
    const iree_tokenizer_split_match_result_t* result, bool absolute) {
  const uint8_t segment_count = result->segment_count;
  uint8_t index = 0;
  while (index < segment_count) {
    // Each segment is stored as a (start, end) pair.
    const iree_host_size_t* pair = &result->segments[index * 2];
    iree_tokenizer_split_emit(emitter, pair[0], pair[1], absolute);
    ++index;
  }
  return !emitter->full;
}
// Context for the inline callback, shared between process() and finalize().
// Holds a transient copy of mutable state that is committed back to the
// persistent state struct after the regex feed/finalize completes.
typedef struct iree_tokenizer_split_callback_context_t {
// Destination for segments produced by each match.
iree_tokenizer_split_emitter_t* emitter;
// Mutable state (updated per-match on successful emit).
// Start of the next gap (end of the last committed match).
iree_host_size_t last_emit_end;
// Buffered range; meaningful only when |has_pending| is true.
iree_host_size_t pending_start;
iree_host_size_t pending_end;
bool has_pending;
// Immutable config.
iree_tokenizer_regex_split_behavior_t behavior;
bool invert;
// Forwarded to the emitter: true stores offsets without rebasing.
bool absolute;
} iree_tokenizer_split_callback_context_t;
// Per-match callback used by both the regex executor and the literal scanner.
// |user_data| is an iree_tokenizer_split_callback_context_t.
//
// Computes the behavior result for the gap [last_emit_end, match.start) and
// the match [match.start, match.end), emits the resulting segments, and then
// commits the new gap start and pending range to the context.
//
// Returns RESOURCE_EXHAUSTED when the output cannot hold every segment of the
// result. In that case the context is left untouched so a later re-scan picks
// up the segments that did not fit; segments that did fit remain emitted and
// are tracked by the emitter's last_end.
static inline iree_status_t iree_tokenizer_split_inline_callback(
    void* user_data, iree_tokenizer_regex_match_t match) {
  iree_tokenizer_split_callback_context_t* ctx =
      (iree_tokenizer_split_callback_context_t*)user_data;

  // The gap is everything between the last committed match and this one.
  const iree_host_size_t gap_start = ctx->last_emit_end;
  const iree_host_size_t gap_end = match.start;

  // Pure computation: nothing is mutated until the emit succeeds.
  const iree_tokenizer_split_match_result_t outcome =
      iree_tokenizer_split_handle_match(
          ctx->behavior, gap_start, gap_end, match.start, match.end,
          ctx->pending_start, ctx->pending_end, ctx->has_pending, ctx->invert);

  const bool emitted_all =
      iree_tokenizer_split_emit_result(ctx->emitter, &outcome, ctx->absolute);
  if (!emitted_all) {
    return iree_status_from_code(IREE_STATUS_RESOURCE_EXHAUSTED);
  }

  // Everything fit: commit the new state.
  ctx->pending_start = outcome.pending_start;
  ctx->pending_end = outcome.pending_end;
  ctx->has_pending = outcome.has_pending;
  ctx->last_emit_end = outcome.last_emit_end;
  return iree_ok_status();
}
// Processes one chunk of input with literal string matching, reporting each
// complete occurrence of the pattern through the shared inline callback (the
// same callback context structure used by regex mode).
//
// |chunk_base| is the absolute offset of input.data[0]. Matches are reported
// with absolute offsets and never overlap: scanning resumes after the end of
// each reported match.
//
// Algorithm:
// 1. If a partial match was carried over from the previous chunk, try to
//    continue it. If it fails, fall back to the next-longest suffix of the
//    carried bytes that is also a prefix of the pattern, so a match that
//    overlaps the failed candidate is not lost (e.g. pattern "aab" split as
//    "aa" | "ab").
// 2. Scan the chunk for new matches, jumping between occurrences of the
//    pattern's first byte.
// 3. Record a partial match at the end of the chunk in |state| so the next
//    call can continue it. Every candidate start near the end of the chunk is
//    examined, not just the first one (e.g. pattern "aaab", chunk "aba").
//
// Returns the callback's status if it fails (RESOURCE_EXHAUSTED when the
// output is full); the carried partial match is cleared in that case.
static iree_status_t iree_tokenizer_split_literal_process(
    const iree_tokenizer_segmenter_split_t* segmenter,
    iree_tokenizer_segmenter_split_state_t* state, iree_string_view_t input,
    iree_host_size_t chunk_base,
    iree_tokenizer_split_callback_context_t* context) {
  const char* pattern = segmenter->literal.pattern.data;
  iree_host_size_t pattern_length = segmenter->literal.pattern.size;
  iree_host_size_t scan_position = 0;

  // Resume a partial match from the previous chunk, if any. Invariant: the
  // last |literal_match_position| bytes before |chunk_base| equal the first
  // |literal_match_position| bytes of the pattern.
  while (state->literal_match_position > 0) {
    iree_host_size_t matched = state->literal_match_position;
    iree_host_size_t remaining = pattern_length - matched;
    iree_host_size_t can_check =
        (remaining < input.size) ? remaining : input.size;
    if (memcmp(input.data, pattern + matched, can_check) == 0) {
      if (can_check < remaining) {
        // Still partial: the whole chunk extends the candidate.
        state->literal_match_position = matched + can_check;
        return iree_ok_status();
      }
      // Full match completed; report it.
      iree_host_size_t match_start = state->literal_match_start;
      iree_host_size_t match_end = chunk_base + can_check;
      iree_tokenizer_regex_match_t match = {match_start, match_end};
      iree_status_t status =
          iree_tokenizer_split_inline_callback(context, match);
      // The candidate is resolved either way; on failure the caller restarts.
      state->literal_match_position = 0;
      if (!iree_status_is_ok(status)) {
        return status;
      }
      scan_position = can_check;
      break;
    }
    // The candidate failed. Slide its start forward to the nearest position
    // whose already-consumed bytes still form a prefix of the pattern and
    // retry. When no such position exists the carried state drops to zero and
    // scanning restarts at the beginning of this chunk.
    iree_host_size_t shift = 1;
    while (shift < matched &&
           memcmp(pattern + shift, pattern, matched - shift) != 0) {
      ++shift;
    }
    state->literal_match_start += shift;
    state->literal_match_position = matched - shift;
  }

  // Scan for new matches.
  while (scan_position < input.size) {
    // Look for the first character of the pattern.
    const char* found = memchr(input.data + scan_position, pattern[0],
                               input.size - scan_position);
    if (!found) {
      // No more potential matches in this chunk.
      break;
    }
    iree_host_size_t match_offset = (iree_host_size_t)(found - input.data);
    iree_host_size_t remaining_input = input.size - match_offset;
    if (remaining_input < pattern_length) {
      // Too few bytes for a full match: this can only be a partial match.
      if (memcmp(found, pattern, remaining_input) == 0) {
        // Partial match - save state for the next chunk.
        state->literal_match_start = chunk_base + match_offset;
        state->literal_match_position = remaining_input;
        break;
      }
      // Not a prefix of the pattern; a later byte may still start one.
      scan_position = match_offset + 1;
      continue;
    }
    if (memcmp(found, pattern, pattern_length) != 0) {
      // Not a match, continue from the next byte.
      scan_position = match_offset + 1;
      continue;
    }
    // Full match found.
    iree_host_size_t match_start = chunk_base + match_offset;
    iree_host_size_t match_end = match_start + pattern_length;
    iree_tokenizer_regex_match_t match = {match_start, match_end};
    iree_status_t status = iree_tokenizer_split_inline_callback(context, match);
    if (!iree_status_is_ok(status)) {
      return status;
    }
    scan_position = match_offset + pattern_length;
  }
  return iree_ok_status();
}
// Finalizes literal matching over |remaining_input|, whose first byte is at
// absolute offset |chunk_base|.
//
// First resolves any partial match carried over from the last processed
// chunk. If the carried candidate cannot be completed, shorter overlapping
// candidates (suffixes of the carried bytes that are also prefixes of the
// pattern) are tried before giving up, so a match straddling the boundary is
// not lost. The carried state is always cleared on return, since no further
// input will arrive to extend it.
//
// Then scans the rest of the input for complete, non-overlapping matches.
// Each match is reported through the shared inline callback; a callback
// failure (RESOURCE_EXHAUSTED when the output is full) is returned as-is.
static iree_status_t iree_tokenizer_split_literal_finalize(
    const iree_tokenizer_segmenter_split_t* segmenter,
    iree_tokenizer_segmenter_split_state_t* state,
    iree_string_view_t remaining_input, iree_host_size_t chunk_base,
    iree_tokenizer_split_callback_context_t* context) {
  const char* pattern = segmenter->literal.pattern.data;
  iree_host_size_t pattern_length = segmenter->literal.pattern.size;

  // Resume partial match if any.
  while (state->literal_match_position > 0) {
    iree_host_size_t matched = state->literal_match_position;
    iree_host_size_t remaining = pattern_length - matched;
    if (remaining_input.size < remaining) {
      // Not enough bytes to finish this candidate. Shorter candidates need
      // even more bytes, so none of them can complete either.
      break;
    }
    if (memcmp(remaining_input.data, pattern + matched, remaining) == 0) {
      // Full match completed.
      iree_host_size_t match_start = state->literal_match_start;
      iree_host_size_t match_end = chunk_base + remaining;
      iree_tokenizer_regex_match_t match = {match_start, match_end};
      iree_status_t status =
          iree_tokenizer_split_inline_callback(context, match);
      if (!iree_status_is_ok(status)) {
        state->literal_match_position = 0;
        return status;
      }
      // Continue scanning after the match.
      remaining_input.data += remaining;
      remaining_input.size -= remaining;
      chunk_base += remaining;
      break;
    }
    // The candidate failed. Slide its start forward to the nearest position
    // whose already-consumed bytes still form a prefix of the pattern.
    iree_host_size_t shift = 1;
    while (shift < matched &&
           memcmp(pattern + shift, pattern, matched - shift) != 0) {
      ++shift;
    }
    state->literal_match_start += shift;
    state->literal_match_position = matched - shift;
  }
  state->literal_match_position = 0;

  // Scan remaining input for matches.
  iree_host_size_t scan_position = 0;
  while (scan_position < remaining_input.size) {
    const char* found = memchr(remaining_input.data + scan_position, pattern[0],
                               remaining_input.size - scan_position);
    if (!found) break;
    iree_host_size_t match_offset =
        (iree_host_size_t)(found - remaining_input.data);
    iree_host_size_t bytes_left = remaining_input.size - match_offset;
    if (bytes_left >= pattern_length &&
        memcmp(found, pattern, pattern_length) == 0) {
      iree_host_size_t match_start = chunk_base + match_offset;
      iree_host_size_t match_end = match_start + pattern_length;
      iree_tokenizer_regex_match_t match = {match_start, match_end};
      iree_status_t status =
          iree_tokenizer_split_inline_callback(context, match);
      if (!iree_status_is_ok(status)) {
        return status;
      }
      scan_position = match_offset + pattern_length;
    } else {
      // Not a (complete) match here; try the next byte.
      scan_position = match_offset + 1;
    }
  }
  return iree_ok_status();
}
//===----------------------------------------------------------------------===//
// Vtable Implementation
//===----------------------------------------------------------------------===//
// Forward declaration so the allocate functions below can reference the
// vtable. The definition is presumably later in this file, after the vtable
// functions themselves.
static const iree_tokenizer_segmenter_vtable_t
iree_tokenizer_segmenter_split_vtable;
// Allocates a regex-mode split segmenter.
//
// |dfa| is the DFA to execute and |dfa_data| is the buffer passed alongside
// it; the segmenter frees |dfa_data| with |allocator| when destroyed.
// |behavior| and |invert| select how matches are turned into segments.
// On success |out_segmenter| receives the new segmenter.
//
// Ownership of |dfa_data| passes to this function on every path: if any
// allocation fails, |dfa_data| is freed here and |out_segmenter| is left
// NULL. Callers must not free |dfa_data| after calling this function.
// NOTE(review): previously |dfa_data| was freed only when stride allocation
// failed and leaked when the segmenter allocation itself failed; confirm the
// header documents the always-takes-ownership contract.
iree_status_t iree_tokenizer_segmenter_split_allocate(
    iree_tokenizer_regex_dfa_t dfa, uint8_t* dfa_data,
    iree_tokenizer_regex_split_behavior_t behavior, bool invert,
    iree_allocator_t allocator, iree_tokenizer_segmenter_t** out_segmenter) {
  IREE_TRACE_ZONE_BEGIN(z0);
  IREE_ASSERT_ARGUMENT(dfa_data);
  IREE_ASSERT_ARGUMENT(out_segmenter);
  *out_segmenter = NULL;

  iree_tokenizer_segmenter_split_t* segmenter = NULL;
  iree_status_t status =
      iree_allocator_malloc(allocator, sizeof(*segmenter), (void**)&segmenter);

  if (iree_status_is_ok(status)) {
    iree_tokenizer_segmenter_initialize(
        &segmenter->base, &iree_tokenizer_segmenter_split_vtable,
        sizeof(iree_tokenizer_segmenter_split_state_t));
    segmenter->allocator = allocator;
    segmenter->behavior = behavior;
    segmenter->invert = invert;
    segmenter->is_literal = false;
    segmenter->regex.dfa = dfa;
    segmenter->regex.dfa_data = dfa_data;
    // Compute stride acceleration data for the DFA.
    status = iree_tokenizer_regex_stride_allocate(
        &segmenter->regex.dfa, allocator, &segmenter->regex.stride);
  }

  if (iree_status_is_ok(status)) {
    *out_segmenter = &segmenter->base;
  } else {
    // Single cleanup path for every failure. |segmenter| may still be NULL
    // here if its allocation was the step that failed.
    iree_allocator_free(allocator, dfa_data);
    iree_allocator_free(allocator, segmenter);
  }
  IREE_TRACE_ZONE_END(z0);
  return status;
}
// Allocates a literal-mode split segmenter that matches the exact bytes of
// |pattern|. The pattern bytes are copied, so the caller's buffer need not
// outlive the call. Returns INVALID_ARGUMENT for an empty pattern. On any
// failure |out_segmenter| is left NULL and nothing remains allocated.
iree_status_t iree_tokenizer_segmenter_split_literal_allocate(
    iree_string_view_t pattern, iree_tokenizer_regex_split_behavior_t behavior,
    bool invert, iree_allocator_t allocator,
    iree_tokenizer_segmenter_t** out_segmenter) {
  IREE_TRACE_ZONE_BEGIN(z0);
  IREE_ASSERT_ARGUMENT(out_segmenter);
  *out_segmenter = NULL;

  if (pattern.size == 0) {
    IREE_RETURN_AND_END_ZONE(
        z0, iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
                             "literal split pattern cannot be empty"));
  }

  // Two allocations: the segmenter itself and its private pattern copy.
  iree_tokenizer_segmenter_split_t* segmenter = NULL;
  uint8_t* pattern_copy = NULL;
  iree_status_t status =
      iree_allocator_malloc(allocator, sizeof(*segmenter), (void**)&segmenter);
  if (iree_status_is_ok(status)) {
    status =
        iree_allocator_malloc(allocator, pattern.size, (void**)&pattern_copy);
  }

  if (!iree_status_is_ok(status)) {
    // Either pointer may still be NULL depending on which step failed.
    iree_allocator_free(allocator, pattern_copy);
    iree_allocator_free(allocator, segmenter);
    IREE_TRACE_ZONE_END(z0);
    return status;
  }

  memcpy(pattern_copy, pattern.data, pattern.size);
  iree_tokenizer_segmenter_initialize(
      &segmenter->base, &iree_tokenizer_segmenter_split_vtable,
      sizeof(iree_tokenizer_segmenter_split_state_t));
  segmenter->allocator = allocator;
  segmenter->behavior = behavior;
  segmenter->invert = invert;
  segmenter->is_literal = true;
  segmenter->literal.pattern_data = pattern_copy;
  segmenter->literal.pattern =
      iree_make_string_view((const char*)pattern_copy, pattern.size);
  *out_segmenter = &segmenter->base;

  IREE_TRACE_ZONE_END(z0);
  return status;
}
// Destroys a split segmenter: releases the mode-specific buffers (pattern
// copy in literal mode; stride data and DFA buffer in regex mode) and then
// the segmenter struct itself, all with the allocator it was created with.
static void iree_tokenizer_segmenter_split_destroy(
    iree_tokenizer_segmenter_t* segmenter) {
  IREE_TRACE_ZONE_BEGIN(z0);
  iree_tokenizer_segmenter_split_t* split =
      (iree_tokenizer_segmenter_split_t*)segmenter;
  // Copy the allocator out first: it lives inside the struct being freed.
  iree_allocator_t allocator = split->allocator;

  if (!split->is_literal) {
    iree_tokenizer_regex_stride_free(split->regex.stride, allocator);
    if (split->regex.dfa_data) {
      iree_allocator_free(allocator, split->regex.dfa_data);
    }
  } else if (split->literal.pattern_data) {
    iree_allocator_free(allocator, split->literal.pattern_data);
  }

  iree_allocator_free(allocator, split);
  IREE_TRACE_ZONE_END(z0);
}
// Initializes a split state in caller-provided |storage|. The storage is
// zeroed, the segmenter's configuration is cached in the state, and in regex
// mode the regex executor state is initialized against the segmenter's DFA.
// Always returns OK.
static iree_status_t iree_tokenizer_segmenter_split_state_initialize(
    const iree_tokenizer_segmenter_t* segmenter, void* storage,
    iree_tokenizer_segmenter_state_t** out_state) {
  IREE_TRACE_ZONE_BEGIN(z0);
  const iree_tokenizer_segmenter_split_t* config =
      (const iree_tokenizer_segmenter_split_t*)segmenter;
  iree_tokenizer_segmenter_split_state_t* split_state =
      (iree_tokenizer_segmenter_split_state_t*)storage;

  // Zeroing clears all positions, flags, and partial-match tracking.
  memset(split_state, 0, sizeof(*split_state));
  split_state->base.segmenter = segmenter;
  split_state->is_literal = config->is_literal;
  split_state->invert = config->invert;
  split_state->behavior = config->behavior;

  if (!split_state->is_literal) {
    iree_tokenizer_regex_exec_initialize(&split_state->regex_state,
                                         &config->regex.dfa);
  }

  *out_state = &split_state->base;
  IREE_TRACE_ZONE_END(z0);
  return iree_ok_status();
}
// Deinitializes a split state. The state holds no resources of its own, so
// there is nothing to release; only the trace zone is recorded.
static void iree_tokenizer_segmenter_split_state_deinitialize(
    iree_tokenizer_segmenter_state_t* state) {
  (void)state;
  IREE_TRACE_ZONE_BEGIN(z0);
  IREE_TRACE_ZONE_END(z0);
}
// Processes one chunk of |input| and writes the resulting segments to
// |output| with offsets relative to the start of this chunk.
//
// |out_consumed| receives the number of input bytes consumed; it may be less
// than input.size when the output fills up or when trailing bytes are held
// back for finalize() (see the capping logic below). |out_segment_count|
// receives the number of segments written.
//
// Returns OK with zero consumed/emitted when the input is empty or the output
// has no capacity. RESOURCE_EXHAUSTED from the matcher is absorbed and turned
// into partial consumption; any other matcher error is returned unchanged.
static iree_status_t iree_tokenizer_segmenter_split_state_process(
iree_tokenizer_segmenter_state_t* state, iree_string_view_t input,
iree_tokenizer_segment_output_t output, iree_host_size_t* out_consumed,
iree_host_size_t* out_segment_count) {
iree_tokenizer_segmenter_split_state_t* self =
(iree_tokenizer_segmenter_split_state_t*)state;
const iree_tokenizer_segmenter_split_t* segmenter =
(const iree_tokenizer_segmenter_split_t*)state->segmenter;
*out_consumed = 0;
*out_segment_count = 0;
if (input.size == 0 || output.capacity == 0) {
return iree_ok_status();
}
// Absolute offset of input.data[0].
iree_host_size_t chunk_base = self->bytes_processed;
// Set up emitter and callback context. The context holds a transient copy
// of mutable state that the inline callback updates per-match.
iree_tokenizer_split_emitter_t emitter = {
.output = output,
.chunk_base = chunk_base,
.count = 0,
.last_end = self->last_emit_end,
.full = false,
};
iree_tokenizer_split_callback_context_t context = {
.emitter = &emitter,
.last_emit_end = self->last_emit_end,
.pending_start = self->pending_start,
.pending_end = self->pending_end,
.has_pending = self->has_pending,
.behavior = self->behavior,
.invert = self->invert,
.absolute = false,
};
// Feed pattern matcher; matches are processed inline via callback.
iree_status_t feed_status;
if (self->is_literal) {
feed_status = iree_tokenizer_split_literal_process(segmenter, self, input,
chunk_base, &context);
} else {
feed_status = iree_tokenizer_regex_exec_feed(
&segmenter->regex.dfa, &self->regex_state, input, chunk_base,
segmenter->regex.stride, iree_tokenizer_split_inline_callback,
&context);
}
// Only RESOURCE_EXHAUSTED (output full) is recoverable here.
if (!iree_status_is_ok(feed_status) &&
!iree_status_is_resource_exhausted(feed_status)) {
return feed_status;
}
if (iree_status_is_resource_exhausted(feed_status)) {
emitter.full = true;
}
iree_status_ignore(feed_status);
// Determine consumption and commit state based on output capacity.
if (emitter.full && emitter.last_end > chunk_base) {
// Output filled mid-chunk: consume up to last emitted segment.
// Discard any pending state accumulated after the last emit — those bytes
// are beyond our consumption point and will be re-fed on the next call.
*out_consumed = emitter.last_end - chunk_base;
self->bytes_processed = emitter.last_end;
self->last_emit_end = emitter.last_end;
self->has_pending = false;
self->literal_match_position = 0;
if (!self->is_literal) {
iree_tokenizer_regex_exec_initialize(&self->regex_state,
&segmenter->regex.dfa);
}
} else if (emitter.full) {
// Output filled but no progress (capacity too small for first result).
// Leave persistent state unchanged — nothing was consumed or emitted.
// The matcher state is still reset because the same bytes will be re-fed.
*out_consumed = 0;
self->literal_match_position = 0;
if (!self->is_literal) {
iree_tokenizer_regex_exec_initialize(&self->regex_state,
&segmenter->regex.dfa);
}
} else {
// Normal case: consume all input, commit full callback state.
self->bytes_processed = chunk_base + input.size;
self->last_emit_end = context.last_emit_end;
self->has_pending = context.has_pending;
self->pending_start = context.pending_start;
self->pending_end = context.pending_end;
// Check if the regex has a pending match that finalize() will need to emit.
// A pending regex match exists when in_match is true AND there's an
// accepting position (either lookahead-passed or fallback).
//
// CRITICAL: When there's a pending regex match, we must NOT consume past
// last_emit_end. The regex tracks match positions cumulatively from init,
// but finalize() uses chunk_base=bytes_processed for position arithmetic.
// If we consume past the pending match's start position, finalize() will
// compute start - chunk_base = underflow.
//
// Example: Input " if" matches [0,3) and [3,6). The [3,6) match is
// pending after process() (greedy match waits for confirming byte).
// If we set bytes_processed=6, finalize() gets chunk_base=6 and computes
// 3 - 6 = underflow. By capping at last_emit_end=3, finalize() gets
// chunk_base=3 and re-scans " if", producing the correct [0,3) relative
// position (absolute [3,6)).
bool has_pending_regex_match =
!self->is_literal && self->regex_state.in_match &&
(self->regex_state.has_accept || self->regex_state.has_accept_fallback);
// Check if we should use wraparound arithmetic for this behavior.
// ISOLATED+invert needs wraparound because trailing gaps must be emitted
// and has_pending relies on last_emit_end < bytes_processed.
bool uses_wraparound =
self->invert &&
self->behavior == IREE_TOKENIZER_UTIL_REGEX_SPLIT_ISOLATED;
// Determine if we need to cap consumed bytes:
// 1. Pending regex match: ALWAYS cap to avoid position underflow
// 2. Trailing data: cap when not handled by pending segment logic
bool has_trailing_data = self->last_emit_end < self->bytes_processed;
bool should_cap_trailing =
!self->has_pending && !uses_wraparound && has_trailing_data;
if (has_pending_regex_match || should_cap_trailing) {
// Cap consumed at last emitted segment. The unconsumed bytes will be
// passed to finalize(), which can re-scan them with correct chunk_base.
*out_consumed = self->last_emit_end - chunk_base;
self->bytes_processed = self->last_emit_end;
self->literal_match_position = 0;
if (!self->is_literal) {
iree_tokenizer_regex_exec_initialize(&self->regex_state,
&segmenter->regex.dfa);
}
// Set deferred flag only when capping specifically for the pending regex
// match case that previously used wraparound arithmetic. This preserves
// has_pending()=true for ISOLATED+invert mode, which relies on it.
// For other cases (should_cap_trailing), the original behavior was
// has_pending()=false after capping.
self->deferred_to_finalize = has_pending_regex_match && uses_wraparound;
} else {
*out_consumed = input.size;
self->deferred_to_finalize = false;
}
}
*out_segment_count = emitter.count;
return iree_ok_status();
}
// Finalizes segmentation for |state| using the last |remaining_input| bytes.
//
// Work is split into two phases so the call can be resumed when |output| fills:
//   1. Feed |remaining_input| to the literal or regex matcher and finalize it.
//      Skipped when finalize_feed_done is already set from a prior call.
//   2. Emit the pending segment (if any) and the trailing gap up to the end of
//      the input.
//
// |out_segment_count| receives the number of segments written by this call.
// When the output fills up the function still returns OK: partial progress is
// recorded in last_emit_end and bytes_processed is left untouched. Matcher
// errors other than resource-exhausted are returned to the caller unchanged.
static iree_status_t iree_tokenizer_segmenter_split_state_finalize(
iree_tokenizer_segmenter_state_t* state, iree_string_view_t remaining_input,
iree_tokenizer_segment_output_t output,
iree_host_size_t* out_segment_count) {
iree_tokenizer_segmenter_split_state_t* self =
(iree_tokenizer_segmenter_split_state_t*)state;
const iree_tokenizer_segmenter_split_t* segmenter =
(const iree_tokenizer_segmenter_split_t*)state->segmenter;
*out_segment_count = 0;
// |remaining_input| is treated as beginning at absolute offset
// bytes_processed; this base is handed to both the emitter and the matchers.
iree_host_size_t chunk_base = self->bytes_processed;
// Set up emitter and callback context (same pattern as process).
iree_tokenizer_split_emitter_t emitter = {
.output = output,
.chunk_base = chunk_base,
.count = 0,
.last_end = self->last_emit_end,
.full = false,
};
// The context starts as a copy of the persistent pending/emit state; it is
// only copied back into |self| if phase 1 completes without filling output.
iree_tokenizer_split_callback_context_t context = {
.emitter = &emitter,
.last_emit_end = self->last_emit_end,
.pending_start = self->pending_start,
.pending_end = self->pending_end,
.has_pending = self->has_pending,
.behavior = self->behavior,
.invert = self->invert,
.absolute = false,
};
// Absolute offset one past the last byte of |remaining_input|.
iree_host_size_t input_end = chunk_base + remaining_input.size;
// Phase 1: Feed remaining input and finalize pattern matcher. This phase
// runs once; on re-entrant calls (after trailing flush overflow), it is
// skipped.
if (!self->finalize_feed_done) {
if (self->is_literal) {
// Literal mode: finalize handles partial matches and remaining input.
iree_status_t finalize_status = iree_tokenizer_split_literal_finalize(
segmenter, self, remaining_input, chunk_base, &context);
if (iree_status_is_resource_exhausted(finalize_status)) {
// Output full - translate to emitter.full flag.
iree_status_ignore(finalize_status);
emitter.full = true;
} else if (!iree_status_is_ok(finalize_status)) {
return finalize_status;
}
} else {
// Regex mode: feed remaining input through regex.
if (remaining_input.size > 0) {
iree_status_t feed_status = iree_tokenizer_regex_exec_feed(
&segmenter->regex.dfa, &self->regex_state, remaining_input,
chunk_base, segmenter->regex.stride,
iree_tokenizer_split_inline_callback, &context);
if (iree_status_is_resource_exhausted(feed_status)) {
// Output full during feed - preserve regex state for re-entry.
iree_status_ignore(feed_status);
emitter.full = true;
} else if (!iree_status_is_ok(feed_status)) {
return feed_status;
}
}
// Finalize regex (may produce one final match at end-of-input).
// Skip if output already full.
if (!emitter.full) {
iree_status_t finalize_status = iree_tokenizer_regex_exec_finalize(
&segmenter->regex.dfa, &self->regex_state, input_end,
iree_tokenizer_split_inline_callback, &context);
if (iree_status_is_ok(finalize_status)) {
// Matching is complete; reset the executor to its initial state.
iree_tokenizer_regex_exec_initialize(&self->regex_state,
&segmenter->regex.dfa);
} else if (iree_status_is_resource_exhausted(finalize_status)) {
// Final match couldn't be emitted - preserve regex state for
// re-entry.
iree_status_ignore(finalize_status);
emitter.full = true;
} else {
return finalize_status;
}
}
}
// Commit progress. When full, commit partial progress so re-entry doesn't
// re-emit. When complete, commit full progress and mark phase done.
// Note that in the full case only last_emit_end is written back: the pending
// fields held in |context| are not copied and finalize_feed_done stays false,
// so phase 1 runs again on the next call.
if (emitter.full) {
self->last_emit_end = emitter.last_end;
} else {
self->last_emit_end = context.last_emit_end;
self->has_pending = context.has_pending;
self->pending_start = context.pending_start;
self->pending_end = context.pending_end;
self->finalize_feed_done = true;
self->deferred_to_finalize = false; // Deferred regex work is done.
}
}
// Phase 2: Flush pending and trailing gap. Each emit may overflow, so state
// updates are guarded per-emit. On re-entry, self's state reflects what was
// already emitted (has_pending=false after pending emitted, last_emit_end
// advanced after each segment).
//
// Trailing gap emission depends on mode:
// - Normal mode: always emit trailing gap (it's content)
// - Invert mode + ISOLATED: emit trailing gap (ISOLATED emits everything)
// - Invert mode + others: skip trailing gap (it's a delimiter)
bool emit_trailing_gap =
!self->invert ||
self->behavior == IREE_TOKENIZER_UTIL_REGEX_SPLIT_ISOLATED;
iree_host_size_t trailing_end = input_end;
if (!emitter.full && self->has_pending) {
if (self->behavior == IREE_TOKENIZER_UTIL_REGEX_SPLIT_MERGED_WITH_NEXT) {
// MERGED_WITH_NEXT: merge pending delimiter with trailing content.
// In invert mode, pending is a delimiter (gap), trailing is content.
// Only merge if there's trailing content OR emit_trailing_gap is true.
if (emit_trailing_gap || trailing_end > self->pending_end) {
iree_tokenizer_split_emit(&emitter, self->pending_start, trailing_end,
false);
} else {
// Invert mode with nothing past pending_end to merge: emit only the
// pending range [pending_start, pending_end) on its own.
iree_tokenizer_split_emit(&emitter, self->pending_start,
self->pending_end, false);
}
// Only clear pending and advance once the emit above actually fit.
if (!emitter.full) {
self->has_pending = false;
self->last_emit_end = trailing_end;
}
} else {
// Other behaviors: pending is emitted as its own segment, followed by the
// trailing gap as a separate segment when the mode calls for it.
iree_tokenizer_split_emit(&emitter, self->pending_start,
self->pending_end, false);
if (!emitter.full) {
self->has_pending = false;
self->last_emit_end = self->pending_end;
if (emit_trailing_gap) {
iree_tokenizer_split_emit(&emitter, self->last_emit_end, trailing_end,
false);
}
// Advance to the end of input whether the gap was emitted or
// intentionally skipped, but not if the gap emit overflowed.
if (!emitter.full) {
self->last_emit_end = trailing_end;
}
}
}
} else if (!emitter.full) {
// No pending segment: only the trailing gap remains.
if (emit_trailing_gap) {
iree_tokenizer_split_emit(&emitter, self->last_emit_end, trailing_end,
false);
}
if (!emitter.full) {
self->last_emit_end = trailing_end;
}
}
*out_segment_count = emitter.count;
if (emitter.full) {
// Commit partial progress so re-entry doesn't re-emit. Caller should check
// has_pending() to know if re-entry is needed.
self->last_emit_end = emitter.last_end;
return iree_ok_status();
}
// All phases complete. Advance bytes_processed and reset for potential reuse.
self->bytes_processed = input_end;
self->finalize_feed_done = false;
return iree_ok_status();
}
// Reports whether a later finalize() call on |state| still has output to
// produce. Returns true when any of the following holds:
//   - a pending segment is buffered,
//   - process() deferred a regex match to finalize(),
//   - (regex mode only) the matcher is inside a match that already has an
//     accepting position recorded,
//   - unemitted bytes trail the last emitted segment and the current mode emits
//     trailing gaps.
static bool iree_tokenizer_segmenter_split_state_has_pending(
    const iree_tokenizer_segmenter_state_t* state) {
  const iree_tokenizer_segmenter_split_state_t* split_state =
      (const iree_tokenizer_segmenter_split_state_t*)state;

  // Buffered pending segment, or a match that process() handed off to
  // finalize() when it capped the consumed byte count.
  if (split_state->has_pending || split_state->deferred_to_finalize) {
    return true;
  }

  // Regex mode: a match that runs up to the end of the input seen so far (e.g.
  // \w+) leaves the matcher in_match with an accept recorded; finalize() is
  // what completes it. The matcher state is only consulted in regex mode.
  if (!split_state->is_literal) {
    if (split_state->regex_state.in_match &&
        (split_state->regex_state.has_accept ||
         split_state->regex_state.has_accept_fallback)) {
      return true;
    }
  }

  // Nothing scanned beyond the last emitted segment: nothing left to emit.
  if (split_state->last_emit_end >= split_state->bytes_processed) {
    return false;
  }

  // Trailing bytes exist. In normal mode they are content and always emitted;
  // in invert mode they are delimiters and only ISOLATED emits them.
  if (!split_state->invert) {
    return true;
  }
  return split_state->behavior == IREE_TOKENIZER_UTIL_REGEX_SPLIT_ISOLATED;
}
// Flushes everything that can be emitted from the bytes already scanned,
// without waiting for more input.
//
// Emission order:
//   1. (regex mode only) an in-progress match that already has an accepting
//      position, routed through the regular match handler;
//   2. the pending segment and the trailing gap up to bytes_processed, using
//      the same mode rules as finalize().
//
// |out_segment_count| receives the number of segments written.
// |out_bytes_committed| receives the offset up to which output has been
// committed. If |output| has zero capacity nothing is changed. If the output
// fills up, OK is returned with partial results and the matcher/position state
// is preserved so the caller can retry with more capacity. When every emission
// succeeds, the matcher is reset and bytes_processed/last_emit_end are rewound
// to 0 for a fresh chunk.
static iree_status_t iree_tokenizer_segmenter_split_state_flush(
    iree_tokenizer_segmenter_state_t* state,
    iree_tokenizer_segment_output_t output, iree_host_size_t* out_segment_count,
    iree_host_size_t* out_bytes_committed) {
  iree_tokenizer_segmenter_split_state_t* self =
      (iree_tokenizer_segmenter_split_state_t*)state;
  const iree_tokenizer_segmenter_split_t* segmenter =
      (const iree_tokenizer_segmenter_split_t*)state->segmenter;
  *out_segment_count = 0;
  *out_bytes_committed = self->last_emit_end;
  if (output.capacity == 0) {
    return iree_ok_status();
  }
  iree_tokenizer_split_emitter_t emitter = {
      .output = output,
      .chunk_base = 0,  // Segments are absolute buffer positions.
      .count = 0,
      .last_end = self->last_emit_end,
      .full = false,
  };
  // If we have an accepted match, emit it first.
  // The regex executor state is only meaningful in regex mode, so it is only
  // consulted when !is_literal (the same guard has_pending() applies and the
  // same condition under which this function re-initializes it below).
  if (!self->is_literal && self->regex_state.in_match &&
      (self->regex_state.has_accept || self->regex_state.has_accept_fallback)) {
    iree_host_size_t match_end = self->regex_state.has_accept
                                     ? self->regex_state.last_accept
                                     : self->regex_state.last_accept_fallback;
    // Gap is between last emit and match start, match is the regex match.
    // Invert flag is passed to handler to swap semantics internally.
    iree_tokenizer_split_match_result_t result =
        iree_tokenizer_split_handle_match(
            self->behavior, self->last_emit_end, self->regex_state.match_start,
            self->regex_state.match_start, match_end, self->pending_start,
            self->pending_end, self->has_pending, self->invert);
    if (!iree_tokenizer_split_emit_result(&emitter, &result, true)) {
      // The match result could not be emitted in full: report what the emitter
      // managed to write and leave |self| untouched so the caller can retry.
      *out_segment_count = emitter.count;
      *out_bytes_committed = emitter.last_end;
      return iree_ok_status();
    }
    self->last_emit_end = result.last_emit_end;
    self->has_pending = result.has_pending;
    self->pending_start = result.pending_start;
    self->pending_end = result.pending_end;
  }
  // Emit trailing gap up to bytes_processed, like finalize does.
  // This ensures we commit all scanned bytes so the buffer can compact.
  // State updates are guarded per-emit for correctness of out_bytes_committed.
  //
  // Trailing gap emission depends on mode (same logic as finalize).
  bool emit_trailing_gap =
      !self->invert ||
      self->behavior == IREE_TOKENIZER_UTIL_REGEX_SPLIT_ISOLATED;
  iree_host_size_t trailing_end = self->bytes_processed;
  if (!emitter.full && trailing_end > self->last_emit_end) {
    if (self->has_pending) {
      if (self->behavior == IREE_TOKENIZER_UTIL_REGEX_SPLIT_MERGED_WITH_NEXT) {
        // MERGED_WITH_NEXT: fold the pending range into the trailing bytes
        // when there is something to merge with (or trailing gaps are emitted
        // in this mode); otherwise emit the pending range on its own.
        if (emit_trailing_gap || trailing_end > self->pending_end) {
          iree_tokenizer_split_emit(&emitter, self->pending_start, trailing_end,
                                    false);
        } else {
          iree_tokenizer_split_emit(&emitter, self->pending_start,
                                    self->pending_end, false);
        }
        if (!emitter.full) {
          self->has_pending = false;
          self->last_emit_end = trailing_end;
        }
      } else {
        // Other behaviors: pending is its own segment, then the trailing gap
        // (when emitted in this mode) is a separate segment.
        iree_tokenizer_split_emit(&emitter, self->pending_start,
                                  self->pending_end, false);
        if (!emitter.full) {
          self->has_pending = false;
          self->last_emit_end = self->pending_end;
          if (emit_trailing_gap) {
            iree_tokenizer_split_emit(&emitter, self->last_emit_end,
                                      trailing_end, false);
          }
          if (!emitter.full) {
            self->last_emit_end = trailing_end;
          }
        }
      }
    } else {
      // No pending segment: only the trailing gap remains.
      if (emit_trailing_gap) {
        iree_tokenizer_split_emit(&emitter, self->last_emit_end, trailing_end,
                                  false);
      }
      if (!emitter.full) {
        self->last_emit_end = trailing_end;
      }
    }
  }
  // Check if output is full BEFORE resetting state. This matches finalize()'s
  // pattern: only reset state when all emissions succeeded. If full, return
  // partial results and preserve state so caller can retry with more capacity.
  *out_segment_count = emitter.count;
  *out_bytes_committed = self->last_emit_end;
  if (emitter.full) {
    return iree_ok_status();
  }
  // All emissions succeeded. Reset pattern matcher state for fresh input.
  if (self->is_literal) {
    self->literal_match_position = 0;
  } else {
    iree_tokenizer_regex_exec_initialize(&self->regex_state,
                                         &segmenter->regex.dfa);
  }
  // Reset position tracking to 0. After flush, the tokenizer compacts the
  // buffer and refills from position 0, so our next process() call will
  // receive a fresh chunk at offset 0.
  self->bytes_processed = 0;
  self->last_emit_end = 0;
  return iree_ok_status();
}
// Dispatch table for the split segmenter: binds each slot of the generic
// iree_tokenizer_segmenter_vtable_t interface to the split-specific
// implementation defined in this file.
static const iree_tokenizer_segmenter_vtable_t
iree_tokenizer_segmenter_split_vtable = {
// Segmenter lifetime.
.destroy = iree_tokenizer_segmenter_split_destroy,
// Per-stream state lifetime.
.state_initialize = iree_tokenizer_segmenter_split_state_initialize,
.state_deinitialize = iree_tokenizer_segmenter_split_state_deinitialize,
// Streaming segmentation entry points.
.state_process = iree_tokenizer_segmenter_split_state_process,
.state_finalize = iree_tokenizer_segmenter_split_state_finalize,
.state_has_pending = iree_tokenizer_segmenter_split_state_has_pending,
.state_flush = iree_tokenizer_segmenter_split_state_flush,
};