// blob: 174e2a155d45514af4c690638389d16be1749365 [file]
// Copyright 2026 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// HuggingFace JSON format tests for tokenizer.
//
// These tests load tokenizers from embedded HuggingFace JSON configs and verify
// correct behavior. They complement the builder-API tests in
// tokenizer_encode_test.cc by testing the full JSON loading pipeline with
// realistic configurations.
//
// Buffer/chunk invariance tests verify that tokenizer produces identical
// output across different transform buffer sizes and chunk feeding patterns.
#include <cstdint>
#include <cstring>
#include <memory>
#include <string>
#include <vector>

#include "iree/base/api.h"
#include "iree/testing/gtest.h"
#include "iree/testing/status_matchers.h"
#include "iree/tokenizer/format/huggingface/tokenizer_json.h"
#include "iree/tokenizer/testdata/streaming_testdata.h"
#include "iree/tokenizer/tokenizer.h"
namespace iree::tokenizer {
namespace {
//===----------------------------------------------------------------------===//
// Test Utilities
//===----------------------------------------------------------------------===//
// Looks up an embedded testdata file by name and returns a view of its
// contents. Returns an empty view when no file with that name exists.
static iree_string_view_t GetEmbeddedFile(const char* name) {
  const struct iree_file_toc_t* toc =
      iree_tokenizer_streaming_testdata_create();
  const size_t entry_count = iree_tokenizer_streaming_testdata_size();
  for (size_t index = 0; index < entry_count; ++index) {
    if (strcmp(toc[index].name, name) != 0) continue;
    return iree_make_string_view(toc[index].data, toc[index].size);
  }
  return iree_string_view_empty();
}
// Deleter functor so std::unique_ptr can own iree_tokenizer_t* handles
// allocated by the C API.
struct TokenizerDeleter {
  void operator()(iree_tokenizer_t* t) const { iree_tokenizer_free(t); }
};
// RAII wrapper: frees the tokenizer automatically when it leaves scope.
using ScopedTokenizer = std::unique_ptr<iree_tokenizer_t, TokenizerDeleter>;
// Loads a tokenizer from an embedded HuggingFace JSON config.
// Returns NOT_FOUND when the file is absent from the testdata bundle.
static iree::StatusOr<ScopedTokenizer> LoadTokenizer(const char* filename) {
  const iree_string_view_t json = GetEmbeddedFile(filename);
  if (!json.size) {
    return iree_make_status(IREE_STATUS_NOT_FOUND, "file '%s' not in testdata",
                            filename);
  }
  iree_tokenizer_t* raw_tokenizer = nullptr;
  IREE_RETURN_IF_ERROR(iree_tokenizer_from_huggingface_json(
      json, iree_allocator_system(), &raw_tokenizer));
  return ScopedTokenizer(raw_tokenizer);
}
// Encodes |text| in a single call via the one-shot API. The output buffer
// holds up to 256 token ids, which covers every input used in these tests.
static StatusOr<std::vector<iree_tokenizer_token_id_t>> EncodeOneShot(
    iree_tokenizer_t* tokenizer, iree_string_view_t text) {
  std::vector<iree_tokenizer_token_id_t> token_ids(256);
  iree_host_size_t token_count = 0;
  IREE_RETURN_IF_ERROR(
      iree_tokenizer_encode(tokenizer, text, IREE_TOKENIZER_ENCODE_FLAG_NONE,
                            iree_tokenizer_make_token_output(
                                token_ids.data(), nullptr, nullptr,
                                token_ids.size()),
                            iree_allocator_system(), &token_count));
  token_ids.resize(token_count);
  return token_ids;
}
// Encodes text using streaming API with specified transform buffer size.
//
// |buffer_size| sizes the transform (ring) buffer handed to the encoder;
// |chunk_size| controls how many bytes are offered per feed call (0 means
// "offer everything remaining at once"). The encoder may consume fewer bytes
// than offered, so the loop advances by the reported consumed count.
static StatusOr<std::vector<iree_tokenizer_token_id_t>> EncodeStreaming(
    iree_tokenizer_t* tokenizer, iree_string_view_t text,
    iree_host_size_t buffer_size, iree_host_size_t chunk_size = 0) {
  // Query and allocate the opaque encode-state storage for this tokenizer.
  iree_host_size_t state_size = 0;
  IREE_RETURN_IF_ERROR(
      iree_tokenizer_encode_state_calculate_size(tokenizer, &state_size));
  std::vector<uint8_t> state_storage(state_size);
  std::vector<uint8_t> transform_buffer(buffer_size);
  iree_tokenizer_encode_state_t* state = nullptr;
  IREE_RETURN_IF_ERROR(iree_tokenizer_encode_state_initialize(
      tokenizer,
      iree_make_byte_span(state_storage.data(), state_storage.size()),
      iree_make_byte_span(transform_buffer.data(), transform_buffer.size()),
      iree_tokenizer_offset_run_list_empty(),
      IREE_TOKENIZER_ENCODE_FLAG_AT_INPUT_START, &state));
  std::vector<iree_tokenizer_token_id_t> all_ids;
  // Fixed-size staging batch; tokens are appended to |all_ids| as produced.
  std::vector<iree_tokenizer_token_id_t> batch_ids(256);
  iree_host_size_t offset = 0;
  while (offset < text.size) {
    iree_host_size_t remaining = text.size - offset;
    // Offer at most |chunk_size| bytes per call (or all remaining when 0).
    iree_host_size_t feed_size =
        (chunk_size > 0 && chunk_size < remaining) ? chunk_size : remaining;
    iree_string_view_t chunk =
        iree_make_string_view(text.data + offset, feed_size);
    iree_host_size_t consumed = 0;
    iree_host_size_t produced = 0;
    iree_status_t status = iree_tokenizer_encode_state_feed(
        state, chunk,
        iree_tokenizer_make_token_output(batch_ids.data(), nullptr, nullptr,
                                         batch_ids.size()),
        &consumed, &produced);
    if (!iree_status_is_ok(status)) {
      // State must be deinitialized on every exit path.
      iree_tokenizer_encode_state_deinitialize(state);
      return status;
    }
    for (iree_host_size_t i = 0; i < produced; ++i) {
      all_ids.push_back(batch_ids[i]);
    }
    offset += consumed;
    // A feed that neither consumes input nor produces tokens would loop
    // forever; surface it as an internal error instead of hanging.
    if (consumed == 0 && produced == 0 && feed_size > 0) {
      iree_tokenizer_encode_state_deinitialize(state);
      return iree_make_status(IREE_STATUS_INTERNAL, "no progress at offset %zu",
                              offset);
    }
  }
  // Drain finalize - may produce multiple batches for large outputs.
  // A completely full batch signals there may be more, so loop until a
  // partial (or empty) batch comes back.
  iree_host_size_t finalize_count = 0;
  do {
    iree_status_t status = iree_tokenizer_encode_state_finalize(
        state,
        iree_tokenizer_make_token_output(batch_ids.data(), nullptr, nullptr,
                                         batch_ids.size()),
        &finalize_count);
    if (!iree_status_is_ok(status)) {
      iree_tokenizer_encode_state_deinitialize(state);
      return status;
    }
    for (iree_host_size_t i = 0; i < finalize_count; ++i) {
      all_ids.push_back(batch_ids[i]);
    }
  } while (finalize_count == batch_ids.size());
  iree_tokenizer_encode_state_deinitialize(state);
  return all_ids;
}
//===----------------------------------------------------------------------===//
// BPE ByteLevel Tests
//===----------------------------------------------------------------------===//
// Fixture that loads the minimal BPE+ByteLevel tokenizer from testdata.
// Skips (rather than fails) when the testdata file is not available.
class BPEByteLevelTest : public ::testing::Test {
 protected:
  void SetUp() override {
    auto loaded = LoadTokenizer("bpe_bytelevel_minimal.json");
    if (!loaded.ok()) {
      GTEST_SKIP() << loaded.status().ToString();
    }
    tokenizer_ = std::move(loaded.value());
  }

  iree_tokenizer_t* tokenizer() { return tokenizer_.get(); }

 private:
  ScopedTokenizer tokenizer_;
};
// Smoke test: the embedded JSON config loads into a non-null tokenizer.
TEST_F(BPEByteLevelTest, LoadsSuccessfully) {
  ASSERT_NE(tokenizer(), nullptr);
}
TEST_F(BPEByteLevelTest, EncodeHelloWorld) {
  IREE_ASSERT_OK_AND_ASSIGN(auto ids,
                            EncodeOneShot(tokenizer(), IREE_SV("hello world")));
  // Correct BPE merge order: "hello" -> ["he", "llo"], " world" -> ["Ġworld"]
  // The merge "l l" at rank 1 fires before "he l" at rank 9, so "hello"
  // tokenizes as ["he", "llo"] not ["hello"].
  // ASSERT (not EXPECT) on the size: the indexed accesses below would read
  // out of bounds if fewer than 3 tokens were produced.
  ASSERT_EQ(ids.size(), 3u);
  EXPECT_EQ(ids[0], 98);   // he
  EXPECT_EQ(ids[1], 105);  // llo
  EXPECT_EQ(ids[2], 110);  // Ġworld
}
// Streaming output must not depend on the transform buffer size.
TEST_F(BPEByteLevelTest, BufferInvarianceHelloWorld) {
  const iree_string_view_t input = IREE_SV("hello world");
  IREE_ASSERT_OK_AND_ASSIGN(auto expected, EncodeOneShot(tokenizer(), input));
  for (iree_host_size_t buffer_size : {256, 512, 1024, 4096, 16384}) {
    IREE_ASSERT_OK_AND_ASSIGN(auto streamed,
                              EncodeStreaming(tokenizer(), input, buffer_size));
    EXPECT_EQ(streamed, expected)
        << "Buffer size " << buffer_size << " differs from one-shot";
  }
}
// Streaming output must not depend on how the input is chunked.
TEST_F(BPEByteLevelTest, ChunkInvarianceHelloWorld) {
  const iree_string_view_t input = IREE_SV("hello world");
  IREE_ASSERT_OK_AND_ASSIGN(auto expected, EncodeOneShot(tokenizer(), input));
  for (iree_host_size_t chunk_size : {1, 2, 3, 4, 5, 7, 11}) {
    IREE_ASSERT_OK_AND_ASSIGN(
        auto streamed, EncodeStreaming(tokenizer(), input, 4096, chunk_size));
    EXPECT_EQ(streamed, expected)
        << "Chunk size " << chunk_size << " differs from one-shot";
  }
}
// Buffer-size invariance for a longer multi-word input.
TEST_F(BPEByteLevelTest, BufferInvarianceTheQuickBrown) {
  const iree_string_view_t input = IREE_SV("the quick brown fox");
  IREE_ASSERT_OK_AND_ASSIGN(auto expected, EncodeOneShot(tokenizer(), input));
  for (iree_host_size_t buffer_size : {256, 512, 1024, 4096}) {
    IREE_ASSERT_OK_AND_ASSIGN(auto streamed,
                              EncodeStreaming(tokenizer(), input, buffer_size));
    EXPECT_EQ(streamed, expected)
        << "Buffer size " << buffer_size << " differs from one-shot";
  }
}
// Chunk-size invariance for a longer multi-word input.
TEST_F(BPEByteLevelTest, ChunkInvarianceTheQuickBrown) {
  const iree_string_view_t input = IREE_SV("the quick brown fox");
  IREE_ASSERT_OK_AND_ASSIGN(auto expected, EncodeOneShot(tokenizer(), input));
  for (iree_host_size_t chunk_size : {1, 2, 3, 5, 7}) {
    IREE_ASSERT_OK_AND_ASSIGN(
        auto streamed, EncodeStreaming(tokenizer(), input, 4096, chunk_size));
    EXPECT_EQ(streamed, expected)
        << "Chunk size " << chunk_size << " differs from one-shot";
  }
}
// Empty input produces no tokens on both the one-shot and streaming paths.
TEST_F(BPEByteLevelTest, EmptyInput) {
  IREE_ASSERT_OK_AND_ASSIGN(
      auto oneshot_ids, EncodeOneShot(tokenizer(), iree_string_view_empty()));
  EXPECT_TRUE(oneshot_ids.empty());
  IREE_ASSERT_OK_AND_ASSIGN(
      auto streamed_ids,
      EncodeStreaming(tokenizer(), iree_string_view_empty(), 4096));
  EXPECT_TRUE(streamed_ids.empty());
}
// A single-byte input must round-trip identically through streaming.
TEST_F(BPEByteLevelTest, SingleCharacter) {
  const iree_string_view_t input = IREE_SV("a");
  IREE_ASSERT_OK_AND_ASSIGN(auto expected, EncodeOneShot(tokenizer(), input));
  IREE_ASSERT_OK_AND_ASSIGN(auto streamed,
                            EncodeStreaming(tokenizer(), input, 256));
  EXPECT_EQ(streamed, expected);
}
// Highly repetitive input stresses merge chains across buffer refills.
TEST_F(BPEByteLevelTest, RepeatedCharacters) {
  const iree_string_view_t input = IREE_SV("aaaaaaaaaa");
  IREE_ASSERT_OK_AND_ASSIGN(auto expected, EncodeOneShot(tokenizer(), input));
  for (iree_host_size_t buffer_size : {256, 512, 1024}) {
    IREE_ASSERT_OK_AND_ASSIGN(auto streamed,
                              EncodeStreaming(tokenizer(), input, buffer_size));
    EXPECT_EQ(streamed, expected)
        << "Buffer size " << buffer_size << " differs for repeated chars";
  }
}
// Consecutive spaces must tokenize identically regardless of chunking.
TEST_F(BPEByteLevelTest, MultipleSpaces) {
  const iree_string_view_t input = IREE_SV("hello world");  // Two spaces.
  IREE_ASSERT_OK_AND_ASSIGN(auto expected, EncodeOneShot(tokenizer(), input));
  for (iree_host_size_t chunk_size : {1, 2, 3}) {
    IREE_ASSERT_OK_AND_ASSIGN(
        auto streamed, EncodeStreaming(tokenizer(), input, 4096, chunk_size));
    EXPECT_EQ(streamed, expected)
        << "Chunk size " << chunk_size << " differs for multiple spaces";
  }
}
// Tests that " the" correctly merges through the chain Ġ+t -> Ġt -> Ġth ->
// Ġthe even when the space and word are split across chunk boundaries.
TEST_F(BPEByteLevelTest, SpacePrefixedWordMerge) {
  const iree_string_view_t input = IREE_SV("hello the world");
  IREE_ASSERT_OK_AND_ASSIGN(auto expected, EncodeOneShot(tokenizer(), input));
  // Chunk sizes chosen to split " the" at every interesting position:
  //   size 5: "hello" | " the " | "world" - space lands on a boundary
  //   size 6: "hello " | "the wo" | "rld" - boundary right after the space
  //   size 7: "hello t" | "he worl" | "d" - boundary mid-word
  for (iree_host_size_t chunk_size : {1, 2, 3, 5, 6, 7}) {
    IREE_ASSERT_OK_AND_ASSIGN(
        auto streamed, EncodeStreaming(tokenizer(), input, 4096, chunk_size));
    EXPECT_EQ(streamed, expected)
        << "Chunk size " << chunk_size << " differs for space-prefixed word";
  }
}
// Tests special token recognition when the token spans chunk boundaries.
// "<|endoftext|>" is 13 bytes; splitting it anywhere must still yield exactly
// one token.
TEST_F(BPEByteLevelTest, SpecialTokenSpanning) {
  const iree_string_view_t input = IREE_SV("<|endoftext|>");
  IREE_ASSERT_OK_AND_ASSIGN(auto expected, EncodeOneShot(tokenizer(), input));
  // The special token maps to the single ID 111.
  EXPECT_EQ(expected.size(), 1u);
  EXPECT_EQ(expected[0], 111);
  // Split the token at every small chunk size, down to byte-at-a-time.
  for (iree_host_size_t chunk_size : {1, 2, 3, 4, 5, 6, 7}) {
    IREE_ASSERT_OK_AND_ASSIGN(
        auto streamed, EncodeStreaming(tokenizer(), input, 4096, chunk_size));
    EXPECT_EQ(streamed, expected)
        << "Chunk size " << chunk_size << " differs for special token spanning";
  }
}
//===----------------------------------------------------------------------===//
// Special Token Negative Tests
//===----------------------------------------------------------------------===//
// Tests that text around special tokens doesn't incorrectly match.
// "x<|endoftext|>y" should NOT collapse to the single special token.
TEST_F(BPEByteLevelTest, SpecialTokenNegative) {
  const iree_string_view_t input = IREE_SV("x<|endoftext|>y");
  IREE_ASSERT_OK_AND_ASSIGN(auto expected, EncodeOneShot(tokenizer(), input));
  // Surrounding characters must prevent the special match, so more than one
  // token comes out.
  EXPECT_GT(expected.size(), 1u);
  for (iree_host_size_t chunk_size : {1, 2, 3}) {
    IREE_ASSERT_OK_AND_ASSIGN(
        auto streamed, EncodeStreaming(tokenizer(), input, 4096, chunk_size));
    EXPECT_EQ(streamed, expected)
        << "Chunk size " << chunk_size << " differs for special token negative";
  }
}
// Tests behavior at exact buffer boundaries: inputs of 64/65/128/129 bytes
// against transform buffers of 64/128/256 bytes, covering exact fill and
// one-byte overflow.
TEST_F(BPEByteLevelTest, BufferBoundaryExact) {
  const std::string inputs[] = {std::string(64, 'a'), std::string(65, 'a'),
                                std::string(128, 'a'), std::string(129, 'a')};
  for (const std::string& text : inputs) {
    const iree_string_view_t input =
        iree_make_string_view(text.data(), text.size());
    IREE_ASSERT_OK_AND_ASSIGN(auto expected, EncodeOneShot(tokenizer(), input));
    for (iree_host_size_t buffer_size : {64, 128, 256}) {
      IREE_ASSERT_OK_AND_ASSIGN(
          auto streamed, EncodeStreaming(tokenizer(), input, buffer_size));
      EXPECT_EQ(streamed, expected)
          << "Buffer size " << buffer_size << " differs for " << text.size()
          << " chars";
    }
  }
}
// Feeds the input one byte at a time and expects one-shot-identical output.
// (Despite the test name, EncodeStreaming never feeds zero-length chunks;
// this is the byte-at-a-time stress case.)
TEST_F(BPEByteLevelTest, EmptyChunks) {
  const iree_string_view_t input = IREE_SV("hello");
  IREE_ASSERT_OK_AND_ASSIGN(auto expected, EncodeOneShot(tokenizer(), input));
  IREE_ASSERT_OK_AND_ASSIGN(auto streamed,
                            EncodeStreaming(tokenizer(), input, 4096, 1));
  EXPECT_EQ(streamed, expected);
}
//===----------------------------------------------------------------------===//
// Post-Normalization Special Token Tests
//===----------------------------------------------------------------------===//
// Post-normalization special tokens have normalized=true and are matched AFTER
// the normalizer runs. This test suite verifies correct behavior when these
// tokens appear in various positions and streaming scenarios.
// Fixture providing a minimal inline BPE tokenizer with one special token
// that has normalized=true (matched AFTER the normalizer runs).
class PostNormSpecialTokenTest : public ::testing::Test {
 protected:
  void SetUp() override {
    // Inline config: a tiny BPE vocab for "hello world"-style text plus a
    // post-normalization special token "<|special|>" (id 100).
    const char* json = R"({
"model": {
"type": "BPE",
"vocab": {
"h": 0, "e": 1, "l": 2, "o": 3, " ": 4, "w": 5, "r": 6, "d": 7,
"he": 8, "ll": 9, "lo": 10,
"<|special|>": 100
},
"merges": ["h e", "l l", "l o"]
},
"added_tokens": [{
"id": 100,
"content": "<|special|>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": true
}],
"pre_tokenizer": {"type": "ByteLevel", "add_prefix_space": false, "trim_offsets": false, "use_regex": true}
})";
    iree_tokenizer_t* raw_tokenizer = nullptr;
    IREE_CHECK_OK(iree_tokenizer_from_huggingface_json(
        iree_make_string_view(json, strlen(json)), iree_allocator_system(),
        &raw_tokenizer));
    tokenizer_.reset(raw_tokenizer);
  }

  iree_tokenizer_t* tokenizer() { return tokenizer_.get(); }

 private:
  ScopedTokenizer tokenizer_;
};
// Verifies that post-norm special tokens appearing after regular text are
// correctly matched. While streaming, the special token may arrive while
// earlier segments are still pending for the model; the implementation must
// not corrupt ring buffer state by advancing read_position prematurely.
TEST_F(PostNormSpecialTokenTest, SpecialTokenAfterText) {
  const iree_string_view_t input = IREE_SV("hello <|special|>");
  IREE_ASSERT_OK_AND_ASSIGN(auto expected, EncodeOneShot(tokenizer(), input));
  // The trailing token must be the special token (id=100).
  EXPECT_EQ(expected.back(), 100u);
  for (iree_host_size_t chunk_size : {1, 2, 3, 5, 7}) {
    IREE_ASSERT_OK_AND_ASSIGN(
        auto streamed, EncodeStreaming(tokenizer(), input, 4096, chunk_size));
    EXPECT_EQ(streamed, expected)
        << "Chunk size " << chunk_size
        << " produced different result for post-norm special token after text";
  }
}
// Verifies that a post-norm special token at the very start of input works.
TEST_F(PostNormSpecialTokenTest, SpecialTokenAtStart) {
  const iree_string_view_t input = IREE_SV("<|special|>hello");
  IREE_ASSERT_OK_AND_ASSIGN(auto expected, EncodeOneShot(tokenizer(), input));
  // The leading token must be the special token (id=100).
  EXPECT_EQ(expected.front(), 100u);
  for (iree_host_size_t chunk_size : {1, 2, 3}) {
    IREE_ASSERT_OK_AND_ASSIGN(
        auto streamed, EncodeStreaming(tokenizer(), input, 4096, chunk_size));
    EXPECT_EQ(streamed, expected)
        << "Chunk size " << chunk_size
        << " produced different result for post-norm special token at start";
  }
}
// Verifies back-to-back post-norm special tokens each match independently.
TEST_F(PostNormSpecialTokenTest, MultipleSpecialTokens) {
  const iree_string_view_t input = IREE_SV("<|special|><|special|>");
  IREE_ASSERT_OK_AND_ASSIGN(auto expected, EncodeOneShot(tokenizer(), input));
  // Exactly two special tokens, nothing in between.
  EXPECT_EQ(expected.size(), 2u);
  EXPECT_EQ(expected[0], 100u);
  EXPECT_EQ(expected[1], 100u);
  for (iree_host_size_t chunk_size : {1, 2, 3}) {
    IREE_ASSERT_OK_AND_ASSIGN(
        auto streamed, EncodeStreaming(tokenizer(), input, 4096, chunk_size));
    EXPECT_EQ(streamed, expected)
        << "Chunk size " << chunk_size
        << " produced different result for multiple post-norm special tokens";
  }
}
// Stresses ring buffer management: very small transform buffers combined
// with 1-byte chunks while a post-norm special token is interleaved with
// regular text and segments are pending.
TEST_F(PostNormSpecialTokenTest, SmallBufferWithInterleavedTokens) {
  const iree_string_view_t input = IREE_SV("hello <|special|> world");
  IREE_ASSERT_OK_AND_ASSIGN(auto expected, EncodeOneShot(tokenizer(), input));
  // Sanity: the special token (id=100) must appear in the reference output.
  bool found_special = false;
  for (auto token_id : expected) {
    if (token_id == 100u) found_special = true;
  }
  EXPECT_TRUE(found_special) << "Special token not found in reference output";
  // Small buffer sizes stress the segment queuing logic.
  for (iree_host_size_t buffer_size : {64, 128, 256}) {
    IREE_ASSERT_OK_AND_ASSIGN(
        auto streamed, EncodeStreaming(tokenizer(), input, buffer_size, 1));
    EXPECT_EQ(streamed, expected)
        << "Buffer size " << buffer_size
        << " with 1-byte chunks produced different result";
  }
}
//===----------------------------------------------------------------------===//
// Whitespace Pre-Tokenizer Tests
//===----------------------------------------------------------------------===//
// Tests that the Whitespace pre_tokenizer works correctly with streaming
// encoding, including proper word boundary detection across chunk boundaries.
// Fixture providing an inline BPE tokenizer with a Whitespace pre_tokenizer.
// Vocab and merges are constructed so "hello" and "world" each merge fully:
//   "hello": h+e -> he, l+l -> ll, ll+o -> llo, he+llo -> hello
//   "world": w+o -> wo, r+l -> rl, rl+d -> rld, wo+rld -> world
class WhitespacePreTokenizerTest : public ::testing::Test {
 protected:
  void SetUp() override {
    const char* json = R"({
"model": {
"type": "BPE",
"vocab": {
"h": 0, "e": 1, "l": 2, "o": 3, "w": 4, "r": 5, "d": 6,
"he": 10, "ll": 11, "wo": 12, "rl": 13,
"llo": 20, "rld": 21,
"hello": 30, "world": 31
},
"merges": [
"h e", "l l", "w o", "r l",
"ll o", "rl d",
"he llo", "wo rld"
]
},
"pre_tokenizer": {"type": "Whitespace"}
})";
    iree_tokenizer_t* raw_tokenizer = nullptr;
    IREE_CHECK_OK(iree_tokenizer_from_huggingface_json(
        iree_make_string_view(json, strlen(json)), iree_allocator_system(),
        &raw_tokenizer));
    tokenizer_.reset(raw_tokenizer);
  }

  iree_tokenizer_t* tokenizer() { return tokenizer_.get(); }

 private:
  ScopedTokenizer tokenizer_;
};
// Smoke test: the inline Whitespace-pre_tokenizer config loads.
TEST_F(WhitespacePreTokenizerTest, LoadsSuccessfully) {
  EXPECT_NE(tokenizer(), nullptr);
}
// "hello" must merge all the way down to the single token ID 30.
TEST_F(WhitespacePreTokenizerTest, SingleWord) {
  const iree_string_view_t input = IREE_SV("hello");
  IREE_ASSERT_OK_AND_ASSIGN(auto tokens, EncodeOneShot(tokenizer(), input));
  EXPECT_EQ(tokens.size(), 1u);
  EXPECT_EQ(tokens[0], 30);
}
// "hello world" splits on the space and merges to ["hello", "world"].
TEST_F(WhitespacePreTokenizerTest, TwoWords) {
  const iree_string_view_t input = IREE_SV("hello world");
  IREE_ASSERT_OK_AND_ASSIGN(auto tokens, EncodeOneShot(tokenizer(), input));
  EXPECT_EQ(tokens.size(), 2u);
  EXPECT_EQ(tokens[0], 30);
  EXPECT_EQ(tokens[1], 31);
}
// Every chunk size from 1..11 places the split at a different position in
// "hello world"; all must match the one-shot result.
TEST_F(WhitespacePreTokenizerTest, StreamingChunkInvariance) {
  const iree_string_view_t input = IREE_SV("hello world");
  IREE_ASSERT_OK_AND_ASSIGN(auto expected, EncodeOneShot(tokenizer(), input));
  for (iree_host_size_t chunk_size = 1; chunk_size <= 11; ++chunk_size) {
    IREE_ASSERT_OK_AND_ASSIGN(
        auto streamed, EncodeStreaming(tokenizer(), input, 4096, chunk_size));
    EXPECT_EQ(streamed, expected)
        << "Chunk size " << chunk_size << " produced different result";
  }
}
// Transform buffer size must not affect the token stream.
TEST_F(WhitespacePreTokenizerTest, StreamingBufferInvariance) {
  const iree_string_view_t input = IREE_SV("hello world");
  IREE_ASSERT_OK_AND_ASSIGN(auto expected, EncodeOneShot(tokenizer(), input));
  for (iree_host_size_t buffer_size : {16, 32, 64, 128, 256, 4096}) {
    IREE_ASSERT_OK_AND_ASSIGN(auto streamed,
                              EncodeStreaming(tokenizer(), input, buffer_size));
    EXPECT_EQ(streamed, expected)
        << "Buffer size " << buffer_size << " produced different result";
  }
}
// Runs of spaces collapse at the pre-tokenizer: still exactly two tokens.
TEST_F(WhitespacePreTokenizerTest, MultipleSpaces) {
  const iree_string_view_t input = IREE_SV("hello world");
  IREE_ASSERT_OK_AND_ASSIGN(auto expected, EncodeOneShot(tokenizer(), input));
  EXPECT_EQ(expected.size(), 2u);
  for (iree_host_size_t chunk_size : {1, 2, 3, 5}) {
    IREE_ASSERT_OK_AND_ASSIGN(
        auto streamed, EncodeStreaming(tokenizer(), input, 4096, chunk_size));
    EXPECT_EQ(streamed, expected)
        << "Chunk size " << chunk_size << " differs for multiple spaces";
  }
}
// Leading/trailing whitespace yields no tokens of its own.
TEST_F(WhitespacePreTokenizerTest, LeadingAndTrailingWhitespace) {
  const iree_string_view_t input = IREE_SV(" hello world ");
  IREE_ASSERT_OK_AND_ASSIGN(auto expected, EncodeOneShot(tokenizer(), input));
  EXPECT_EQ(expected.size(), 2u);
  for (iree_host_size_t chunk_size : {1, 2, 3, 5}) {
    IREE_ASSERT_OK_AND_ASSIGN(
        auto streamed, EncodeStreaming(tokenizer(), input, 4096, chunk_size));
    EXPECT_EQ(streamed, expected) << "Chunk size " << chunk_size
                                  << " differs for leading/trailing whitespace";
  }
}
// "WhitespaceSplit" is accepted as an alias for "Whitespace" at load time.
TEST_F(WhitespacePreTokenizerTest, WhitespaceSplitAlias) {
  const char* json = R"({
"model": {
"type": "BPE",
"vocab": {"a": 0},
"merges": []
},
"pre_tokenizer": {"type": "WhitespaceSplit"}
})";
  iree_tokenizer_t* loaded = nullptr;
  IREE_EXPECT_OK(iree_tokenizer_from_huggingface_json(
      iree_make_string_view(json, strlen(json)), iree_allocator_system(),
      &loaded));
  if (loaded) iree_tokenizer_free(loaded);
}
//===----------------------------------------------------------------------===//
// HuggingFace Ground Truth Validation Tests
//===----------------------------------------------------------------------===//
// These tests verify that IREE's tokenizer produces IDENTICAL output to
// HuggingFace across multiple buffer sizes. Expected token IDs are generated
// from the HuggingFace tokenizers library:
// uv run --with tokenizers python generate_testdata_expected_ids.py
// Buffer sizes to test. Smaller sizes force partial-segment mode.
constexpr iree_host_size_t kGroundTruthBufferSizes[] = {64, 128, 256, 512,
                                                        65536};
// Expected token IDs from HuggingFace tokenizers library. Regenerate with
// the script referenced above if the testdata tokenizer changes.
// "hello" -> ['he', 'llo']
static constexpr iree_tokenizer_token_id_t kHelloExpected[] = {98, 105};
// "hello world" -> ['he', 'llo', 'Ġworld']
static constexpr iree_tokenizer_token_id_t kHelloWorldExpected[] = {98, 105,
                                                                    110};
// "The quick brown fox" -> individual chars with Ġ for spaces
static constexpr iree_tokenizer_token_id_t kSimpleASCIIExpected[] = {
    51, 98, 94, 80, 84, 72, 66, 74, 94, 65, 81, 78, 86, 77, 94, 69, 78, 87};
// "hello<|endoftext|>world" -> ['he', 'llo', '<|endoftext|>', 'w', 'orld']
static constexpr iree_tokenizer_token_id_t kWithSpecialExpected[] = {
    98, 105, 111, 86, 108};
// " hello world " -> spaces become Ġ tokens
static constexpr iree_tokenizer_token_id_t kWhitespaceExpected[] = {
    94, 94, 98, 105, 94, 94, 110, 94, 94};
// "a" -> ['a']
static constexpr iree_tokenizer_token_id_t kSingleCharExpected[] = {64};
// "12345" -> ['1', '2', '3', '4', '5']
static constexpr iree_tokenizer_token_id_t kNumbersExpected[] = {16, 17, 18, 19,
                                                                 20};
// "!@#$%^&*()" -> individual punctuation tokens
static constexpr iree_tokenizer_token_id_t kPunctuationExpected[] = {
    0, 31, 2, 3, 4, 61, 5, 9, 7, 8};
// Parameterized test fixture for ground truth validation. The test parameter
// is the transform buffer size used for streaming encodes.
class GroundTruthTest : public ::testing::TestWithParam<iree_host_size_t> {
 protected:
  void SetUp() override {
    iree_string_view_t json_str = GetEmbeddedFile("bpe_bytelevel_minimal.json");
    ASSERT_GT(json_str.size, 0u) << "Tokenizer JSON not found";
    IREE_ASSERT_OK(iree_tokenizer_from_huggingface_json(
        json_str, iree_allocator_system(), &tokenizer_));
  }
  void TearDown() override {
    if (tokenizer_) {
      iree_tokenizer_free(tokenizer_);
    }
  }

  // Encode input text using the current buffer size (from parameterized test).
  //
  // Fix: the previous implementation issued a single feed call and ignored
  // |bytes_consumed|; with a small transform buffer a feed may legitimately
  // consume only part of the input, silently dropping the tail. Feed in a
  // loop until all input bytes are consumed (with a no-progress guard).
  std::vector<iree_tokenizer_token_id_t> EncodeWithBufferSize(
      const char* input, iree_host_size_t buffer_size) {
    iree_host_size_t state_size = 0;
    IREE_EXPECT_OK(
        iree_tokenizer_encode_state_calculate_size(tokenizer_, &state_size));
    std::vector<uint8_t> state_storage(state_size);
    std::vector<uint8_t> transform_buffer(buffer_size);
    iree_tokenizer_encode_state_t* state = nullptr;
    IREE_EXPECT_OK(iree_tokenizer_encode_state_initialize(
        tokenizer_,
        iree_make_byte_span(state_storage.data(), state_storage.size()),
        iree_make_byte_span(transform_buffer.data(), transform_buffer.size()),
        iree_tokenizer_offset_run_list_empty(),
        IREE_TOKENIZER_ENCODE_FLAG_AT_INPUT_START, &state));
    std::vector<iree_tokenizer_token_id_t> tokens(1024);
    iree_host_size_t total_tokens = 0;
    const size_t input_length = strlen(input);
    size_t offset = 0;
    while (offset < input_length) {
      iree_string_view_t remaining =
          iree_make_string_view(input + offset, input_length - offset);
      iree_host_size_t bytes_consumed = 0;
      iree_host_size_t token_count = 0;
      IREE_EXPECT_OK(iree_tokenizer_encode_state_feed(
          state, remaining,
          iree_tokenizer_make_token_output(tokens.data() + total_tokens, NULL,
                                           NULL, tokens.size() - total_tokens),
          &bytes_consumed, &token_count));
      total_tokens += token_count;
      // Guard against an infinite loop when the encoder makes no progress.
      if (bytes_consumed == 0 && token_count == 0) break;
      offset += bytes_consumed;
    }
    iree_host_size_t finalize_count = 0;
    IREE_EXPECT_OK(iree_tokenizer_encode_state_finalize(
        state,
        iree_tokenizer_make_token_output(tokens.data() + total_tokens, NULL,
                                         NULL, tokens.size() - total_tokens),
        &finalize_count));
    total_tokens += finalize_count;
    tokens.resize(total_tokens);
    iree_tokenizer_encode_state_deinitialize(state);
    return tokens;
  }

  // Encode using byte-by-byte feeding (most stressful streaming test).
  //
  // Fix: each 1-byte chunk is now re-offered until the encoder reports it as
  // consumed, instead of assuming every feed call accepts its byte.
  std::vector<iree_tokenizer_token_id_t> EncodeByteByByte(
      const char* input, iree_host_size_t buffer_size) {
    iree_host_size_t state_size = 0;
    IREE_EXPECT_OK(
        iree_tokenizer_encode_state_calculate_size(tokenizer_, &state_size));
    std::vector<uint8_t> state_storage(state_size);
    std::vector<uint8_t> transform_buffer(buffer_size);
    iree_tokenizer_encode_state_t* state = nullptr;
    IREE_EXPECT_OK(iree_tokenizer_encode_state_initialize(
        tokenizer_,
        iree_make_byte_span(state_storage.data(), state_storage.size()),
        iree_make_byte_span(transform_buffer.data(), transform_buffer.size()),
        iree_tokenizer_offset_run_list_empty(),
        IREE_TOKENIZER_ENCODE_FLAG_AT_INPUT_START, &state));
    std::vector<iree_tokenizer_token_id_t> tokens(1024);
    iree_host_size_t total_tokens = 0;
    const size_t input_length = strlen(input);
    for (size_t i = 0; i < input_length; ++i) {
      iree_string_view_t chunk = iree_make_string_view(input + i, 1);
      iree_host_size_t bytes_consumed = 0;
      // Re-offer the byte until it is consumed; producing tokens without
      // consuming is legal (output backpressure), but zero progress is not.
      while (bytes_consumed == 0) {
        iree_host_size_t token_count = 0;
        IREE_EXPECT_OK(iree_tokenizer_encode_state_feed(
            state, chunk,
            iree_tokenizer_make_token_output(tokens.data() + total_tokens, NULL,
                                             NULL,
                                             tokens.size() - total_tokens),
            &bytes_consumed, &token_count));
        total_tokens += token_count;
        if (bytes_consumed == 0 && token_count == 0) break;
      }
    }
    iree_host_size_t finalize_count = 0;
    IREE_EXPECT_OK(iree_tokenizer_encode_state_finalize(
        state,
        iree_tokenizer_make_token_output(tokens.data() + total_tokens, NULL,
                                         NULL, tokens.size() - total_tokens),
        &finalize_count));
    total_tokens += finalize_count;
    tokens.resize(total_tokens);
    iree_tokenizer_encode_state_deinitialize(state);
    return tokens;
  }

  iree_tokenizer_t* tokenizer_ = nullptr;
};
// "hello world" matches HuggingFace output at every buffer size.
TEST_P(GroundTruthTest, HelloWorld) {
  const iree_host_size_t buffer_size = GetParam();
  const auto tokens = EncodeWithBufferSize("hello world", buffer_size);
  constexpr size_t kCount =
      sizeof(kHelloWorldExpected) / sizeof(kHelloWorldExpected[0]);
  const std::vector<iree_tokenizer_token_id_t> expected(
      kHelloWorldExpected, kHelloWorldExpected + kCount);
  EXPECT_EQ(tokens, expected)
      << "Buffer size: " << buffer_size << ", input: 'hello world'";
}
// Same ground truth holds when the input is fed one byte at a time.
TEST_P(GroundTruthTest, HelloWorldByteByByte) {
  const iree_host_size_t buffer_size = GetParam();
  const auto tokens = EncodeByteByByte("hello world", buffer_size);
  constexpr size_t kCount =
      sizeof(kHelloWorldExpected) / sizeof(kHelloWorldExpected[0]);
  const std::vector<iree_tokenizer_token_id_t> expected(
      kHelloWorldExpected, kHelloWorldExpected + kCount);
  EXPECT_EQ(tokens, expected) << "Buffer size: " << buffer_size
                              << ", input: 'hello world' (byte-by-byte)";
}
// Plain ASCII text with spaces matches HuggingFace output.
TEST_P(GroundTruthTest, SimpleASCII) {
  const iree_host_size_t buffer_size = GetParam();
  const auto tokens = EncodeWithBufferSize("The quick brown fox", buffer_size);
  constexpr size_t kCount =
      sizeof(kSimpleASCIIExpected) / sizeof(kSimpleASCIIExpected[0]);
  const std::vector<iree_tokenizer_token_id_t> expected(
      kSimpleASCIIExpected, kSimpleASCIIExpected + kCount);
  EXPECT_EQ(tokens, expected)
      << "Buffer size: " << buffer_size << ", input: 'The quick brown fox'";
}
// An embedded special token matches HuggingFace output.
TEST_P(GroundTruthTest, WithSpecialToken) {
  const iree_host_size_t buffer_size = GetParam();
  const auto tokens =
      EncodeWithBufferSize("hello<|endoftext|>world", buffer_size);
  constexpr size_t kCount =
      sizeof(kWithSpecialExpected) / sizeof(kWithSpecialExpected[0]);
  const std::vector<iree_tokenizer_token_id_t> expected(
      kWithSpecialExpected, kWithSpecialExpected + kCount);
  EXPECT_EQ(tokens, expected)
      << "Buffer size: " << buffer_size << ", input: 'hello<|endoftext|>world'";
}
// Leading/trailing/multiple spaces match HuggingFace output.
TEST_P(GroundTruthTest, Whitespace) {
  const iree_host_size_t buffer_size = GetParam();
  const auto tokens = EncodeWithBufferSize("  hello  world  ", buffer_size);
  constexpr size_t kCount =
      sizeof(kWhitespaceExpected) / sizeof(kWhitespaceExpected[0]);
  const std::vector<iree_tokenizer_token_id_t> expected(
      kWhitespaceExpected, kWhitespaceExpected + kCount);
  EXPECT_EQ(tokens, expected)
      << "Buffer size: " << buffer_size << ", input: '  hello  world  '";
}
// A single character matches HuggingFace output.
TEST_P(GroundTruthTest, SingleChar) {
  const iree_host_size_t buffer_size = GetParam();
  const auto tokens = EncodeWithBufferSize("a", buffer_size);
  constexpr size_t kCount =
      sizeof(kSingleCharExpected) / sizeof(kSingleCharExpected[0]);
  const std::vector<iree_tokenizer_token_id_t> expected(
      kSingleCharExpected, kSingleCharExpected + kCount);
  EXPECT_EQ(tokens, expected)
      << "Buffer size: " << buffer_size << ", input: 'a'";
}
// Digits tokenize individually and match HuggingFace output.
TEST_P(GroundTruthTest, Numbers) {
  const iree_host_size_t buffer_size = GetParam();
  const auto tokens = EncodeWithBufferSize("12345", buffer_size);
  constexpr size_t kCount =
      sizeof(kNumbersExpected) / sizeof(kNumbersExpected[0]);
  const std::vector<iree_tokenizer_token_id_t> expected(
      kNumbersExpected, kNumbersExpected + kCount);
  EXPECT_EQ(tokens, expected)
      << "Buffer size: " << buffer_size << ", input: '12345'";
}
// Punctuation tokenizes individually and matches HuggingFace output.
TEST_P(GroundTruthTest, Punctuation) {
  const iree_host_size_t buffer_size = GetParam();
  const auto tokens = EncodeWithBufferSize("!@#$%^&*()", buffer_size);
  constexpr size_t kCount =
      sizeof(kPunctuationExpected) / sizeof(kPunctuationExpected[0]);
  const std::vector<iree_tokenizer_token_id_t> expected(
      kPunctuationExpected, kPunctuationExpected + kCount);
  EXPECT_EQ(tokens, expected)
      << "Buffer size: " << buffer_size << ", input: '!@#$%^&*()'";
}
// An empty input yields an empty token stream at every buffer size.
TEST_P(GroundTruthTest, EmptyInput) {
  const iree_host_size_t buffer_size = GetParam();
  const auto tokens = EncodeWithBufferSize("", buffer_size);
  EXPECT_TRUE(tokens.empty()) << "Buffer size: " << buffer_size
                              << ", expected empty output for empty input";
}
// Run all parameterized tests with each buffer size. The name generator
// turns each parameter into a readable suffix, e.g. "BufferSize128".
INSTANTIATE_TEST_SUITE_P(
    BufferSizes, GroundTruthTest, ::testing::ValuesIn(kGroundTruthBufferSizes),
    [](const ::testing::TestParamInfo<iree_host_size_t>& info) {
      return "BufferSize" + std::to_string(info.param);
    });
// Formats |s| as a quoted string with escapes for readable test output.
// Newline, tab, and single-quote get their conventional escapes; all other
// non-printable bytes (including DEL and high bytes) are emitted as two-digit
// hex escapes.
//
// Fix: the previous implementation appended std::to_string() after "\x",
// producing a DECIMAL value in hex clothing (e.g. byte 27 printed as "\x27",
// which reads as 0x27). Emit real two-digit hex instead.
static std::string repr(const char* s) {
  static const char kHexDigits[] = "0123456789abcdef";
  std::string result = "'";
  for (const char* p = s; *p; ++p) {
    const unsigned char c = static_cast<unsigned char>(*p);
    if (c == '\n') {
      result += "\\n";
    } else if (c == '\t') {
      result += "\\t";
    } else if (c == '\'') {
      result += "\\'";
    } else if (c < 32 || c >= 127) {
      result += "\\x";
      result += kHexDigits[(c >> 4) & 0xF];
      result += kHexDigits[c & 0xF];
    } else {
      result += static_cast<char>(c);
    }
  }
  result += "'";
  return result;
}
// Verifies that ALL buffer sizes produce IDENTICAL output.
//
// Fix: the previous implementation issued a single feed call per encode and
// ignored |bytes_consumed|; with a small transform buffer (e.g. 64 bytes) a
// feed may consume only part of the input, silently dropping the tail and
// weakening the invariance check. Feed in a loop until all bytes are consumed
// (with a no-progress guard).
TEST(GroundTruthInvarianceTest, AllBufferSizesProduceIdenticalOutput) {
  iree_string_view_t json_str = GetEmbeddedFile("bpe_bytelevel_minimal.json");
  ASSERT_GT(json_str.size, 0u);
  iree_tokenizer_t* tokenizer = nullptr;
  IREE_ASSERT_OK(iree_tokenizer_from_huggingface_json(
      json_str, iree_allocator_system(), &tokenizer));
  const char* test_inputs[] = {
      "hello world",
      "The quick brown fox jumps over the lazy dog",
      "hello<|endoftext|>world",
      "   lots   of   whitespace   ",
      "!@#$%^&*()",
      "12345",
      "",
  };
  for (const char* input : test_inputs) {
    std::vector<std::vector<iree_tokenizer_token_id_t>> results_per_size;
    for (iree_host_size_t buffer_size : kGroundTruthBufferSizes) {
      iree_host_size_t state_size = 0;
      IREE_EXPECT_OK(
          iree_tokenizer_encode_state_calculate_size(tokenizer, &state_size));
      std::vector<uint8_t> state_storage(state_size);
      std::vector<uint8_t> transform_buffer(buffer_size);
      iree_tokenizer_encode_state_t* state = nullptr;
      IREE_EXPECT_OK(iree_tokenizer_encode_state_initialize(
          tokenizer,
          iree_make_byte_span(state_storage.data(), state_storage.size()),
          iree_make_byte_span(transform_buffer.data(),
                              transform_buffer.size()),
          iree_tokenizer_offset_run_list_empty(),
          IREE_TOKENIZER_ENCODE_FLAG_AT_INPUT_START, &state));
      std::vector<iree_tokenizer_token_id_t> tokens(1024);
      iree_host_size_t total_tokens = 0;
      const size_t input_length = strlen(input);
      size_t offset = 0;
      while (offset < input_length) {
        iree_string_view_t remaining =
            iree_make_string_view(input + offset, input_length - offset);
        iree_host_size_t bytes_consumed = 0;
        iree_host_size_t token_count = 0;
        IREE_EXPECT_OK(iree_tokenizer_encode_state_feed(
            state, remaining,
            iree_tokenizer_make_token_output(tokens.data() + total_tokens,
                                             NULL, NULL,
                                             tokens.size() - total_tokens),
            &bytes_consumed, &token_count));
        total_tokens += token_count;
        // Guard against an infinite loop when no progress is made.
        if (bytes_consumed == 0 && token_count == 0) break;
        offset += bytes_consumed;
      }
      iree_host_size_t finalize_count = 0;
      IREE_EXPECT_OK(iree_tokenizer_encode_state_finalize(
          state,
          iree_tokenizer_make_token_output(tokens.data() + total_tokens, NULL,
                                           NULL, tokens.size() - total_tokens),
          &finalize_count));
      total_tokens += finalize_count;
      tokens.resize(total_tokens);
      iree_tokenizer_encode_state_deinitialize(state);
      results_per_size.push_back(tokens);
    }
    // Verify all buffer sizes produced identical results.
    for (size_t i = 1; i < results_per_size.size(); ++i) {
      EXPECT_EQ(results_per_size[0], results_per_size[i])
          << "Input: " << repr(input) << "\n"
          << "Buffer size " << kGroundTruthBufferSizes[0] << " produced "
          << results_per_size[0].size() << " tokens\n"
          << "Buffer size " << kGroundTruthBufferSizes[i] << " produced "
          << results_per_size[i].size() << " tokens";
    }
  }
  iree_tokenizer_free(tokenizer);
}
} // namespace
} // namespace iree::tokenizer