blob: 535685a8e756c1c8cc68a0af4819d0910ab38435 [file]
// Copyright 2026 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// Normalizer interface for tokenizer preprocessing.
//
// Normalizers transform text before segmentation: NFC normalization,
// lowercasing, stripping, etc. Each normalizer type implements the vtable
// interface for pull-based streaming processing.
//
// Design principles:
// - Pull-based: caller controls flow via output buffer size
// - Batched: output buffer size determines processing batch size
// - Zero allocation: state lives in caller-provided storage
// - Composable: Sequence normalizer chains multiple normalizers
#ifndef IREE_TOKENIZER_NORMALIZER_H_
#define IREE_TOKENIZER_NORMALIZER_H_
#include "iree/base/api.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct iree_tokenizer_normalizer_t iree_tokenizer_normalizer_t;
typedef struct iree_tokenizer_normalizer_state_t
iree_tokenizer_normalizer_state_t;
typedef struct iree_tokenizer_normalizer_vtable_t
iree_tokenizer_normalizer_vtable_t;
//===----------------------------------------------------------------------===//
// Normalizer Flags
//===----------------------------------------------------------------------===//
// Flags passed to state_process() to communicate segment boundary information.
// This enables the tokenizer to signal when the normalizer should treat the
// current input as the end of a logical segment (e.g., before a special token).
typedef uint32_t iree_tokenizer_normalizer_flags_t;
enum iree_tokenizer_normalizer_flag_bits_e {
IREE_TOKENIZER_NORMALIZER_FLAG_NONE = 0,
// This is the last input for the current logical segment (special token
// boundary, stream end, etc.). Normalizers should:
// - Treat "maybe trailing" content as actually trailing
// - Flush any buffered content that was waiting for lookahead
// - Reset internal state for the next segment
//
// This flag prevents deadlocks when normalizers use lazy consumption. For
// example, the Strip normalizer doesn't consume trailing whitespace until it
// sees non-whitespace (to determine if it's intermediate or trailing). When
// the tokenizer limits input at a special token boundary, the normalizer
// might see only whitespace with no way to look ahead. This flag signals
// that no more input will arrive for this segment, so the whitespace is
// definitely trailing.
IREE_TOKENIZER_NORMALIZER_FLAG_SEGMENT_END = 1u << 0,
// Position 0 of the original input was consumed by a special token. This
// signals to the prepend normalizer that it should skip prepending for this
// segment even though it's the first text the normalizer sees.
//
// For example, with prepend_scheme="first" and input "<s>hello":
// - Special token "<s>" matches at position 0
// - Text "hello" follows, but it's NOT at the "first" position
// - Without this flag, prepend would emit "▁hello" (incorrect)
// - With this flag, prepend knows to skip, emitting "hello" (correct)
IREE_TOKENIZER_NORMALIZER_FLAG_FIRST_CONSUMED = 1u << 1,
};
//===----------------------------------------------------------------------===//
// Normalizer Base Type
//===----------------------------------------------------------------------===//
// Base normalizer structure. All concrete normalizer types embed this at
// offset 0. The vtable provides type-specific operations; state_size is
// cached at creation time for efficient state allocation.
struct iree_tokenizer_normalizer_t {
const iree_tokenizer_normalizer_vtable_t* vtable; // Must be at offset 0.
// Size of state struct, cached at creation.
iree_host_size_t state_size;
};
// Base streaming state structure. All concrete state types embed this at
// offset 0. The normalizer back-pointer enables vtable dispatch from state.
struct iree_tokenizer_normalizer_state_t {
const iree_tokenizer_normalizer_t* normalizer;
};
//===----------------------------------------------------------------------===//
// Normalizer VTable
//===----------------------------------------------------------------------===//
// VTable for normalizer implementations.
//
// Pull-based processing model:
// - state_process() consumes input and writes to output
// - Caller controls batch size via output buffer capacity
// - Incomplete data (e.g., partial UTF-8) buffered in state
// - state_finalize() flushes remaining buffered data
struct iree_tokenizer_normalizer_vtable_t {
void (*destroy)(iree_tokenizer_normalizer_t* normalizer);
iree_status_t (*state_initialize)(
const iree_tokenizer_normalizer_t* normalizer, void* storage,
iree_tokenizer_normalizer_state_t** out_state);
void (*state_deinitialize)(iree_tokenizer_normalizer_state_t* state);
iree_status_t (*state_process)(iree_tokenizer_normalizer_state_t* state,
iree_string_view_t input,
iree_mutable_string_view_t output,
iree_tokenizer_normalizer_flags_t flags,
iree_host_size_t* IREE_RESTRICT out_consumed,
iree_host_size_t* IREE_RESTRICT out_written);
iree_status_t (*state_finalize)(iree_tokenizer_normalizer_state_t* state,
iree_mutable_string_view_t output,
iree_host_size_t* IREE_RESTRICT out_written);
bool (*state_has_pending)(const iree_tokenizer_normalizer_state_t* state);
};
//===----------------------------------------------------------------------===//
// Normalizer Public API
//===----------------------------------------------------------------------===//
// Initializes base normalizer fields. Called by concrete implementations
// after allocating the normalizer struct.
void iree_tokenizer_normalizer_initialize(
iree_tokenizer_normalizer_t* normalizer,
const iree_tokenizer_normalizer_vtable_t* vtable,
iree_host_size_t state_size);
// Frees a normalizer. Safe to call with NULL.
void iree_tokenizer_normalizer_free(iree_tokenizer_normalizer_t* normalizer);
// Returns the size of state storage required for this normalizer.
static inline iree_host_size_t iree_tokenizer_normalizer_state_size(
const iree_tokenizer_normalizer_t* normalizer) {
return normalizer->state_size;
}
// Initializes streaming state in caller-provided storage.
// |storage| must be at least iree_tokenizer_normalizer_state_size() bytes.
static inline iree_status_t iree_tokenizer_normalizer_state_initialize(
const iree_tokenizer_normalizer_t* normalizer, void* storage,
iree_tokenizer_normalizer_state_t** out_state) {
return normalizer->vtable->state_initialize(normalizer, storage, out_state);
}
// Deinitializes streaming state (does not free storage).
static inline void iree_tokenizer_normalizer_state_deinitialize(
iree_tokenizer_normalizer_state_t* state) {
if (state && state->normalizer) {
state->normalizer->vtable->state_deinitialize(state);
}
}
// Processes input text, writing normalized output.
//
// Reads up to |input.size| bytes and writes up to |output.size| bytes.
// |flags| communicates segment boundary information from the tokenizer.
// Returns bytes consumed in |out_consumed| and bytes written in |out_written|.
//
// May consume less than the full input if the output buffer is full or if the
// normalizer is waiting for more input to complete a normalization unit (e.g.,
// combining characters for NFC composition). Partial UTF-8 sequences are not
// handled here — the caller is responsible for ensuring input chunks end on
// codepoint boundaries.
//
// When IREE_TOKENIZER_NORMALIZER_FLAG_SEGMENT_END is set, this is the last
// input for the current logical segment. Normalizers should finalize any
// pending state without waiting for more lookahead (e.g., treat potential
// trailing content as actually trailing) and reset for the next segment.
static inline iree_status_t iree_tokenizer_normalizer_state_process(
iree_tokenizer_normalizer_state_t* state, iree_string_view_t input,
iree_mutable_string_view_t output, iree_tokenizer_normalizer_flags_t flags,
iree_host_size_t* IREE_RESTRICT out_consumed,
iree_host_size_t* IREE_RESTRICT out_written) {
return state->normalizer->vtable->state_process(state, input, output, flags,
out_consumed, out_written);
}
// Finalizes the stream, flushing all buffered data.
// Call after all input has been provided via state_process().
// May need multiple calls if output buffer is too small for buffered data.
static inline iree_status_t iree_tokenizer_normalizer_state_finalize(
iree_tokenizer_normalizer_state_t* state, iree_mutable_string_view_t output,
iree_host_size_t* IREE_RESTRICT out_written) {
return state->normalizer->vtable->state_finalize(state, output, out_written);
}
// Returns true if state has pending data that would produce output on
// finalize. Useful for checking if more finalize() calls are needed.
static inline bool iree_tokenizer_normalizer_state_has_pending(
const iree_tokenizer_normalizer_state_t* state) {
return state->normalizer->vtable->state_has_pending(state);
}
#ifdef __cplusplus
} // extern "C"
#endif
#endif // IREE_TOKENIZER_NORMALIZER_H_