blob: 8a67a8fa00a7d4b513bbd0c80826c9cd194b0e4a [file]
// Copyright 2026 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// Fuzz target for Punctuation segmenter.
//
// Tests the punctuation segmenter's robustness against:
// - ASCII punctuation characters
// - Unicode punctuation (category P)
// - Long runs of punctuation
// - Mixed punctuation and text
// - All split behaviors (ISOLATED, REMOVED, MERGED_*, CONTIGUOUS)
// - Invalid UTF-8 sequences
// - Chunk boundary handling for multi-byte punctuation
//
// See https://iree.dev/developers/debugging/fuzzing/ for build and run info.
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include "iree/base/api.h"
#include "iree/tokenizer/segmenter.h"
#include "iree/tokenizer/segmenter/punctuation.h"
// Streams |data| through a freshly initialized state of |segmenter| in chunks
// of at most |chunk_size| bytes, then finalizes the state with whatever input
// was not consumed.
//
// Parameters:
//   segmenter:  segmenter whose streaming state is exercised.
//   data/size:  input bytes; passed to the segmenter as a string view.
//   chunk_size: preferred number of bytes to offer per process call. A value
//               of 0 is treated as 1 so that the loop always offers input.
//
// All statuses are ignored: the function only drives the API and never
// inspects the produced segments. It returns early without processing when the
// reported state size is 0 or larger than 64 KiB, when the state buffer cannot
// be allocated, or when state initialization fails.
static void process_with_chunk_size(iree_tokenizer_segmenter_t* segmenter,
                                    const uint8_t* data, size_t size,
                                    size_t chunk_size) {
  // An empty chunk can never be consumed and doubling 0 stays 0, so a zero
  // chunk size with non-empty input would spin in the stall path below.
  if (chunk_size == 0) chunk_size = 1;

  iree_host_size_t state_size = iree_tokenizer_segmenter_state_size(segmenter);
  if (state_size == 0 || state_size > 64 * 1024) return;
  void* state_buffer = malloc(state_size);
  if (!state_buffer) return;

  iree_tokenizer_segmenter_state_t* state = NULL;
  iree_status_t status = iree_tokenizer_segmenter_state_initialize(
      segmenter, state_buffer, &state);
  if (!iree_status_is_ok(status)) {
    iree_status_ignore(status);
    free(state_buffer);
    return;
  }

  // Fixed-size scratch storage for emitted segments; contents are never read.
  iree_tokenizer_segment_t segments[64];
  const iree_host_size_t segment_capacity =
      sizeof(segments) / sizeof(segments[0]);
  iree_tokenizer_segment_output_t output =
      iree_tokenizer_make_segment_output(segments, segment_capacity);

  size_t offset = 0;
  size_t current_chunk_size = chunk_size;
  while (offset < size) {
    size_t remaining = size - offset;
    size_t this_chunk =
        remaining < current_chunk_size ? remaining : current_chunk_size;
    iree_string_view_t input_chunk = iree_make_string_view(
        reinterpret_cast<const char*>(data + offset), this_chunk);
    iree_host_size_t consumed = 0;
    iree_host_size_t segment_count = 0;
    status = iree_tokenizer_segmenter_state_process(state, input_chunk, output,
                                                    &consumed, &segment_count);
    if (iree_status_is_resource_exhausted(status)) {
      // Not fatal for the fuzzer: drop the status, rebuild the output over the
      // same scratch array, and keep going with whatever was consumed.
      iree_status_ignore(status);
      output = iree_tokenizer_make_segment_output(segments, segment_capacity);
    } else if (!iree_status_is_ok(status)) {
      // Any other failure ends streaming; finalize is still invoked below.
      iree_status_ignore(status);
      break;
    }

    if (consumed == 0 && segment_count == 0) {
      // No progress with this chunk. If it already covered all remaining
      // input there is nothing larger to offer, so leave the rest to finalize.
      if (current_chunk_size >= remaining) break;
      // Otherwise retry with a chunk twice as large, capped at |remaining|.
      // current_chunk_size < remaining here, so the subtraction cannot
      // underflow and the comparison avoids overflowing the doubling.
      current_chunk_size =
          current_chunk_size < remaining - current_chunk_size
              ? current_chunk_size * 2
              : remaining;
      continue;
    }

    // Progress was made: go back to the requested chunk size and advance.
    current_chunk_size = chunk_size;
    offset += consumed;
  }

  // Hand any unconsumed tail to finalize; its status is irrelevant here.
  iree_string_view_t remaining_input = iree_make_string_view(
      reinterpret_cast<const char*>(data + offset), size - offset);
  iree_host_size_t final_segment_count = 0;
  status = iree_tokenizer_segmenter_state_finalize(
      state, remaining_input, output, &final_segment_count);
  iree_status_ignore(status);

  iree_tokenizer_segmenter_state_deinitialize(state);
  free(state_buffer);
}
static void test_with_behavior(const uint8_t* data, size_t size,
iree_tokenizer_regex_split_behavior_t behavior) {
iree_tokenizer_segmenter_t* segmenter = NULL;
iree_status_t status = iree_tokenizer_segmenter_punctuation_allocate(
behavior, iree_allocator_system(), &segmenter);
if (!iree_status_is_ok(status)) {
iree_status_ignore(status);
return;
}
process_with_chunk_size(segmenter, data, size, 1);
process_with_chunk_size(segmenter, data, size, 5);
process_with_chunk_size(segmenter, data, size, size);
iree_tokenizer_segmenter_free(segmenter);
}
extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
if (size > 16 * 1024) size = 16 * 1024;
// Test all split behaviors.
test_with_behavior(data, size, IREE_TOKENIZER_UTIL_REGEX_SPLIT_ISOLATED);
test_with_behavior(data, size, IREE_TOKENIZER_UTIL_REGEX_SPLIT_REMOVED);
test_with_behavior(data, size,
IREE_TOKENIZER_UTIL_REGEX_SPLIT_MERGED_WITH_PREVIOUS);
test_with_behavior(data, size,
IREE_TOKENIZER_UTIL_REGEX_SPLIT_MERGED_WITH_NEXT);
test_with_behavior(data, size, IREE_TOKENIZER_UTIL_REGEX_SPLIT_CONTIGUOUS);
return 0;
}