blob: 6ea23daaacffd527c3cc414dc24425c4c791a8b6 [file]
// Copyright 2026 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// Fuzz target for Split segmenter.
//
// Tests the regex-based split segmenter's robustness against:
// - Various regex patterns
// - All split behaviors (ISOLATED, REMOVED, MERGED_*, CONTIGUOUS)
// - Invert mode (pattern matches tokens vs delimiters)
// - Pattern matches at chunk boundaries
// - Empty segments from consecutive delimiters
// - Invalid UTF-8 in input
// - Streaming with various chunk sizes
//
// Note: This fuzzer tests with a fixed set of safe regex patterns to avoid
// spending fuzz time on regex compilation failures. The regex_fuzz target
// covers regex compilation edge cases separately.
//
// See https://iree.dev/developers/debugging/fuzzing/ for build and run info.
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include "iree/base/api.h"
#include "iree/tokenizer/regex/compile.h"
#include "iree/tokenizer/segmenter.h"
#include "iree/tokenizer/segmenter/split.h"
// Pre-compiled patterns for testing (compiled once at startup).
static iree_tokenizer_segmenter_t* g_segmenters[10] = {NULL};
static size_t g_segmenter_count = 0;
// Helper to compile a pattern and create a segmenter.
static iree_tokenizer_segmenter_t* compile_segmenter(
const char* pattern, iree_tokenizer_regex_split_behavior_t behavior,
bool invert) {
// Compile pattern to binary DFA data.
uint8_t* dfa_data = NULL;
iree_host_size_t dfa_size = 0;
iree_tokenizer_regex_compile_error_t error = {0};
iree_status_t status = iree_tokenizer_regex_compile(
iree_make_cstring_view(pattern),
IREE_TOKENIZER_UTIL_REGEX_COMPILE_FLAG_NONE, iree_allocator_system(),
&dfa_data, &dfa_size, &error);
if (!iree_status_is_ok(status)) {
iree_status_ignore(status);
return NULL;
}
// Load the DFA from binary data.
iree_tokenizer_regex_dfa_t dfa = {0};
status = iree_tokenizer_regex_dfa_load(
iree_make_const_byte_span(dfa_data, dfa_size), &dfa);
if (!iree_status_is_ok(status)) {
iree_status_ignore(status);
iree_allocator_free(iree_allocator_system(), dfa_data);
return NULL;
}
// Create segmenter (takes ownership of dfa_data on success).
iree_tokenizer_segmenter_t* segmenter = NULL;
status = iree_tokenizer_segmenter_split_allocate(
dfa, dfa_data, behavior, invert, iree_allocator_system(), &segmenter);
if (!iree_status_is_ok(status)) {
iree_status_ignore(status);
iree_allocator_free(iree_allocator_system(), dfa_data);
return NULL;
}
return segmenter;
}
extern "C" int LLVMFuzzerInitialize(int* argc, char*** argv) {
(void)argc;
(void)argv;
// Compile a variety of useful patterns with different behaviors.
// Pattern 1: Whitespace splitting (common use case).
g_segmenters[g_segmenter_count++] =
compile_segmenter("\\s+", IREE_TOKENIZER_UTIL_REGEX_SPLIT_REMOVED, false);
// Pattern 2: Word matching (GPT-2 style, invert mode).
g_segmenters[g_segmenter_count++] = compile_segmenter(
"[a-zA-Z]+", IREE_TOKENIZER_UTIL_REGEX_SPLIT_ISOLATED, true);
// Pattern 3: Punctuation as delimiter.
g_segmenters[g_segmenter_count++] = compile_segmenter(
"[.,!?;:]", IREE_TOKENIZER_UTIL_REGEX_SPLIT_ISOLATED, false);
// Pattern 4: Merged with previous.
g_segmenters[g_segmenter_count++] = compile_segmenter(
" ", IREE_TOKENIZER_UTIL_REGEX_SPLIT_MERGED_WITH_PREVIOUS, false);
// Pattern 5: Merged with next.
g_segmenters[g_segmenter_count++] = compile_segmenter(
" ", IREE_TOKENIZER_UTIL_REGEX_SPLIT_MERGED_WITH_NEXT, false);
// Pattern 6: Contiguous.
g_segmenters[g_segmenter_count++] =
compile_segmenter("-", IREE_TOKENIZER_UTIL_REGEX_SPLIT_CONTIGUOUS, false);
return 0;
}
static void process_with_chunk_size(iree_tokenizer_segmenter_t* segmenter,
const uint8_t* data, size_t size,
size_t chunk_size) {
if (!segmenter) return;
iree_host_size_t state_size = iree_tokenizer_segmenter_state_size(segmenter);
if (state_size == 0 || state_size > 64 * 1024) return;
void* state_buffer = malloc(state_size);
if (!state_buffer) return;
iree_tokenizer_segmenter_state_t* state = NULL;
iree_status_t status = iree_tokenizer_segmenter_state_initialize(
segmenter, state_buffer, &state);
if (!iree_status_is_ok(status)) {
iree_status_ignore(status);
free(state_buffer);
return;
}
iree_tokenizer_segment_t segments[64];
iree_tokenizer_segment_output_t output =
iree_tokenizer_make_segment_output(segments, 64);
size_t offset = 0;
size_t current_chunk_size = chunk_size;
while (offset < size) {
size_t remaining = size - offset;
size_t this_chunk =
remaining < current_chunk_size ? remaining : current_chunk_size;
iree_string_view_t input_chunk = iree_make_string_view(
reinterpret_cast<const char*>(data + offset), this_chunk);
iree_host_size_t consumed = 0;
iree_host_size_t segment_count = 0;
status = iree_tokenizer_segmenter_state_process(state, input_chunk, output,
&consumed, &segment_count);
if (iree_status_is_resource_exhausted(status)) {
iree_status_ignore(status);
output = iree_tokenizer_make_segment_output(segments, 64);
} else if (!iree_status_is_ok(status)) {
iree_status_ignore(status);
break;
}
if (consumed == 0 && segment_count == 0) {
if (current_chunk_size >= remaining) break;
current_chunk_size = current_chunk_size * 2 < remaining
? current_chunk_size * 2
: remaining;
continue;
}
current_chunk_size = chunk_size;
offset += consumed;
}
iree_string_view_t remaining_input = iree_make_string_view(
reinterpret_cast<const char*>(data + offset), size - offset);
iree_host_size_t final_segment_count = 0;
status = iree_tokenizer_segmenter_state_finalize(
state, remaining_input, output, &final_segment_count);
iree_status_ignore(status);
iree_tokenizer_segmenter_state_deinitialize(state);
free(state_buffer);
}
extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
if (size > 16 * 1024) size = 16 * 1024;
// Test each pre-compiled segmenter with different chunk sizes.
for (size_t i = 0; i < g_segmenter_count; ++i) {
if (g_segmenters[i]) {
process_with_chunk_size(g_segmenters[i], data, size, 1);
process_with_chunk_size(g_segmenters[i], data, size, 7);
process_with_chunk_size(g_segmenters[i], data, size, size);
}
}
return 0;
}