blob: 243f9b337630ebe9eb4a39ed6b6571a3dc6bd78d [file]
// Copyright 2026 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// Benchmarks for tokenizer encode/decode performance.
//
// These benchmarks measure:
// - Single encode throughput at various text lengths
// - Batch encode throughput with state reuse
// - Streaming encode overhead vs batch
// - Cold start latency (including tokenizer construction)
// - Decode throughput via pre-decoded fast path
//
// Run with: iree-bazel-run //runtime/src/iree/tokenizer:tokenizer_benchmark
// Filter: --benchmark_filter='Encode'
//
// Custom text file (via environment variable):
// IREE_BENCHMARK_TEXT_FILE=/path/to/file.txt iree-bazel-run ...
// When set, uses the file contents instead of generated text.
// The text_bytes argument becomes a limit on how much of the file to use.
#include <algorithm>  // std::max (used in decode fixture SetUp)
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <fstream>
#include <string>
#include <vector>

#include "benchmark/benchmark.h"
#include "iree/base/api.h"
#include "iree/tokenizer/decoder/byte_level.h"
#include "iree/tokenizer/model/bpe.h"
#include "iree/tokenizer/segmenter/whitespace.h"
#include "iree/tokenizer/tokenizer.h"
#include "iree/tokenizer/vocab/vocab.h"
#include "iree/tokenizer/vocab/vocab_builder.h"
namespace {
//===----------------------------------------------------------------------===//
// Text Generation
//===----------------------------------------------------------------------===//
// Common English words for realistic text generation.
// Top 100 words by frequency, covering ~50% of typical English text.
// Also inserted verbatim as whole tokens into the benchmark vocabularies so
// generated text exercises both whole-word hits and per-byte fallback.
static const char* const kCommonWords[] = {
    "the",   "of",    "and",   "to",    "a",     "in",    "is",    "it",
    "you",   "that",  "he",    "was",   "for",   "on",    "are",   "with",
    "as",    "his",   "they",  "be",    "at",    "one",   "have",  "this",
    "from",  "or",    "had",   "by",    "not",   "word",  "but",   "what",
    "some",  "we",    "can",   "out",   "other", "were",  "all",   "there",
    "when",  "up",    "use",   "your",  "how",   "said",  "an",    "each",
    "she",   "which", "do",    "their", "time",  "if",    "will",  "way",
    "about", "many",  "then",  "them",  "write", "would", "like",  "so",
    "these", "her",   "long",  "make",  "thing", "see",   "him",   "two",
    "has",   "look",  "more",  "day",   "could", "go",    "come",  "did",
    "no",    "most",  "my",    "over",  "such",  "our",   "man",   "me",
    "even",  "new",   "just",  "only",  "any",   "know",  "take",  "into",
    "year",  "good",  "give",  "after",
};
// Number of entries in kCommonWords (100).
static constexpr size_t kCommonWordCount =
    sizeof(kCommonWords) / sizeof(kCommonWords[0]);
// Generates a deterministic pseudo-random lowercase word (for rare tokens).
// The same seed always yields the same word; length is 3..10 characters.
static std::string GenerateSyntheticWord(uint32_t seed) {
  const size_t length = 3 + (seed >> 24) % 8;
  std::string word(length, '\0');
  uint32_t lcg = seed;
  for (char& c : word) {
    // Classic LCG step (same constants as the other generators in this file).
    lcg = lcg * 1103515245 + 12345;
    c = static_cast<char>('a' + (lcg >> 16) % 26);
  }
  return word;
}
// Generates deterministic pseudo-English text of approximately target_length
// bytes: space-separated words, ~70% drawn from kCommonWords and ~30%
// synthetic. May overshoot target_length by up to one word plus a space.
static std::string GenerateText(uint32_t seed, size_t target_length) {
  std::string text;
  text.reserve(target_length + 20);
  uint32_t rng = seed;
  while (text.size() < target_length) {
    rng = rng * 1103515245 + 12345;
    // 70% common words, 30% synthetic (mimics rare tokens).
    if ((rng >> 16) % 10 < 7) {
      text.append(kCommonWords[(rng >> 8) % kCommonWordCount]);
    } else {
      text.append(GenerateSyntheticWord(rng));
    }
    text.push_back(' ');
  }
  return text;
}
// Loads text from the file named by the IREE_BENCHMARK_TEXT_FILE env var,
// truncated to at most target_length bytes. Returns an empty string when the
// env var is unset/empty or the file cannot be opened.
static std::string LoadTextFromEnvFile(size_t target_length) {
  const char* path = std::getenv("IREE_BENCHMARK_TEXT_FILE");
  if (path == nullptr || *path == '\0') return "";
  std::ifstream file(path, std::ios::binary);
  if (!file.is_open()) return "";
  // Read up to target_length bytes; gcount() reports how many we actually got
  // (the file may be shorter than the requested limit).
  std::string contents(target_length, '\0');
  file.read(&contents[0], static_cast<std::streamsize>(target_length));
  contents.resize(static_cast<size_t>(file.gcount()));
  return contents;
}
// Returns text for benchmarking: contents of IREE_BENCHMARK_TEXT_FILE when
// that env var points at a readable file, otherwise deterministic generated
// pseudo-English text of approximately target_length bytes.
static std::string GetBenchmarkText(uint32_t seed, size_t target_length) {
  if (std::string file_text = LoadTextFromEnvFile(target_length);
      !file_text.empty()) {
    return file_text;
  }
  return GenerateText(seed, target_length);
}
//===----------------------------------------------------------------------===//
// Tokenizer Factory
//===----------------------------------------------------------------------===//
// Builds a tokenizer with the specified vocabulary size.
// Structure:
// - 256 byte tokens (indices 0-255)
// - Common English words as whole tokens
// - Synthetic subword tokens to reach target size
// - BPE merges connecting byte tokens to form common patterns
static iree_tokenizer_t* BuildBenchmarkTokenizer(size_t vocab_size) {
iree_tokenizer_vocab_builder_t* vocab_builder = nullptr;
IREE_CHECK_OK(iree_tokenizer_vocab_builder_allocate(
vocab_size, iree_allocator_system(), &vocab_builder));
iree_tokenizer_token_id_t next_id = 0;
// Add byte tokens (0-255).
for (int byte_value = 0; byte_value < 256 && next_id < vocab_size;
++byte_value) {
char byte_str[2] = {static_cast<char>(byte_value), '\0'};
IREE_CHECK_OK(iree_tokenizer_vocab_builder_add_token_with_id(
vocab_builder, next_id++, iree_make_string_view(byte_str, 1), 0.0f,
IREE_TOKENIZER_TOKEN_ATTR_NONE));
}
// Add common English words as single tokens.
for (size_t i = 0; i < kCommonWordCount && next_id < vocab_size; ++i) {
IREE_CHECK_OK(iree_tokenizer_vocab_builder_add_token_with_id(
vocab_builder, next_id++, iree_make_cstring_view(kCommonWords[i]), 0.0f,
IREE_TOKENIZER_TOKEN_ATTR_NONE));
}
// Add BPE merge result tokens.
const char* common_pairs[] = {"th", "he", "in", "er", "an",
"re", "on", "at", "en", "nd"};
const size_t pair_count = sizeof(common_pairs) / sizeof(common_pairs[0]);
for (size_t i = 0; i < pair_count && next_id < vocab_size; ++i) {
IREE_CHECK_OK(iree_tokenizer_vocab_builder_add_token_with_id(
vocab_builder, next_id++, iree_make_string_view(common_pairs[i], 2),
0.0f, IREE_TOKENIZER_TOKEN_ATTR_NONE));
}
// Fill remaining slots with synthetic tokens.
uint32_t seed = 0x12345678;
while (next_id < vocab_size) {
seed = seed * 1103515245 + 12345;
std::string token = GenerateSyntheticWord(seed);
iree_status_t status = iree_tokenizer_vocab_builder_add_token_with_id(
vocab_builder, next_id,
iree_make_string_view(token.data(), token.size()), 0.0f,
IREE_TOKENIZER_TOKEN_ATTR_NONE);
if (iree_status_is_ok(status)) {
++next_id;
} else {
iree_status_ignore(status);
}
}
// Add merge rules linking byte tokens to merged tokens.
for (size_t i = 0; i < pair_count; ++i) {
iree_tokenizer_token_id_t left =
static_cast<iree_tokenizer_token_id_t>(common_pairs[i][0]);
iree_tokenizer_token_id_t right =
static_cast<iree_tokenizer_token_id_t>(common_pairs[i][1]);
IREE_CHECK_OK(
iree_tokenizer_vocab_builder_add_merge(vocab_builder, left, right));
}
iree_tokenizer_vocab_t* vocab = nullptr;
IREE_CHECK_OK(iree_tokenizer_vocab_builder_build(vocab_builder, &vocab));
// Create BPE model.
iree_tokenizer_model_t* model = nullptr;
IREE_CHECK_OK(iree_tokenizer_bpe_model_allocate(
vocab, IREE_TOKENIZER_BPE_FLAG_NONE, iree_allocator_system(), &model));
// Create whitespace segmenter.
iree_tokenizer_segmenter_t* segmenter = nullptr;
IREE_CHECK_OK(iree_tokenizer_segmenter_whitespace_allocate(
iree_allocator_system(), &segmenter));
// Build tokenizer.
iree_tokenizer_builder_t builder;
iree_tokenizer_builder_initialize(iree_allocator_system(), &builder);
iree_tokenizer_builder_set_segmenter(&builder, segmenter);
iree_tokenizer_builder_set_model(&builder, model);
iree_tokenizer_builder_set_vocab(&builder, vocab);
iree_tokenizer_t* tokenizer = nullptr;
IREE_CHECK_OK(iree_tokenizer_builder_build(&builder, &tokenizer));
iree_tokenizer_builder_deinitialize(&builder);
return tokenizer;
}
// Builds a tokenizer with a ByteLevel decoder, enabling pre-decoded fast path.
// The ByteLevel decoder has STATELESS capability, so all vocab tokens get
// pre-decoded at build time — decode becomes a memcpy from a flat table.
static iree_tokenizer_t* BuildBenchmarkTokenizerWithDecoder(size_t vocab_size) {
iree_tokenizer_vocab_builder_t* vocab_builder = nullptr;
IREE_CHECK_OK(iree_tokenizer_vocab_builder_allocate(
vocab_size, iree_allocator_system(), &vocab_builder));
iree_tokenizer_token_id_t next_id = 0;
// Add byte tokens (0-255).
for (int byte_value = 0; byte_value < 256 && next_id < vocab_size;
++byte_value) {
char byte_str[2] = {static_cast<char>(byte_value), '\0'};
IREE_CHECK_OK(iree_tokenizer_vocab_builder_add_token_with_id(
vocab_builder, next_id++, iree_make_string_view(byte_str, 1), 0.0f,
IREE_TOKENIZER_TOKEN_ATTR_NONE));
}
// Add common English words as single tokens.
for (size_t i = 0; i < kCommonWordCount && next_id < vocab_size; ++i) {
IREE_CHECK_OK(iree_tokenizer_vocab_builder_add_token_with_id(
vocab_builder, next_id++, iree_make_cstring_view(kCommonWords[i]), 0.0f,
IREE_TOKENIZER_TOKEN_ATTR_NONE));
}
// Add BPE merge result tokens.
const char* common_pairs[] = {"th", "he", "in", "er", "an",
"re", "on", "at", "en", "nd"};
const size_t pair_count = sizeof(common_pairs) / sizeof(common_pairs[0]);
for (size_t i = 0; i < pair_count && next_id < vocab_size; ++i) {
IREE_CHECK_OK(iree_tokenizer_vocab_builder_add_token_with_id(
vocab_builder, next_id++, iree_make_string_view(common_pairs[i], 2),
0.0f, IREE_TOKENIZER_TOKEN_ATTR_NONE));
}
// Fill remaining slots with synthetic tokens.
uint32_t seed = 0x12345678;
while (next_id < vocab_size) {
seed = seed * 1103515245 + 12345;
std::string token = GenerateSyntheticWord(seed);
iree_status_t status = iree_tokenizer_vocab_builder_add_token_with_id(
vocab_builder, next_id,
iree_make_string_view(token.data(), token.size()), 0.0f,
IREE_TOKENIZER_TOKEN_ATTR_NONE);
if (iree_status_is_ok(status)) {
++next_id;
} else {
iree_status_ignore(status);
}
}
// Add merge rules linking byte tokens.
for (size_t i = 0; i < pair_count; ++i) {
iree_tokenizer_token_id_t left =
static_cast<iree_tokenizer_token_id_t>(common_pairs[i][0]);
iree_tokenizer_token_id_t right =
static_cast<iree_tokenizer_token_id_t>(common_pairs[i][1]);
IREE_CHECK_OK(
iree_tokenizer_vocab_builder_add_merge(vocab_builder, left, right));
}
iree_tokenizer_vocab_t* vocab = nullptr;
IREE_CHECK_OK(iree_tokenizer_vocab_builder_build(vocab_builder, &vocab));
// Create BPE model.
iree_tokenizer_model_t* model = nullptr;
IREE_CHECK_OK(iree_tokenizer_bpe_model_allocate(
vocab, IREE_TOKENIZER_BPE_FLAG_NONE, iree_allocator_system(), &model));
// Create whitespace segmenter.
iree_tokenizer_segmenter_t* segmenter = nullptr;
IREE_CHECK_OK(iree_tokenizer_segmenter_whitespace_allocate(
iree_allocator_system(), &segmenter));
// Create ByteLevel decoder (STATELESS — enables pre-decode fast path).
iree_tokenizer_decoder_t* decoder = nullptr;
IREE_CHECK_OK(iree_tokenizer_decoder_byte_level_allocate(
iree_allocator_system(), &decoder));
// Build tokenizer.
iree_tokenizer_builder_t builder;
iree_tokenizer_builder_initialize(iree_allocator_system(), &builder);
iree_tokenizer_builder_set_segmenter(&builder, segmenter);
iree_tokenizer_builder_set_model(&builder, model);
iree_tokenizer_builder_set_decoder(&builder, decoder);
iree_tokenizer_builder_set_vocab(&builder, vocab);
iree_tokenizer_t* tokenizer = nullptr;
IREE_CHECK_OK(iree_tokenizer_builder_build(&builder, &tokenizer));
iree_tokenizer_builder_deinitialize(&builder);
return tokenizer;
}
//===----------------------------------------------------------------------===//
// Encode Benchmarks
//===----------------------------------------------------------------------===//
// Fixture for encode benchmarks. Caches tokenizers across iterations.
// Args: state.range(0) = vocab_size, state.range(1) = text_length.
//
// Uses a pool of different texts to prevent artificial cache warming from
// processing the same text repeatedly. Each iteration uses a different text.
class EncodeBenchmark : public benchmark::Fixture {
 public:
  static constexpr size_t kTextPoolSize = 64;

  void SetUp(benchmark::State& state) override {
    size_t vocab_size = static_cast<size_t>(state.range(0));
    size_t text_length = static_cast<size_t>(state.range(1));
    tokenizer_ = GetCachedTokenizer(vocab_size);
    // Generate pool of different texts to rotate through.
    texts_.resize(kTextPoolSize);
    for (size_t i = 0; i < kTextPoolSize; ++i) {
      texts_[i] =
          GetBenchmarkText(static_cast<uint32_t>(0x42420000 + i), text_length);
    }
    text_index_ = 0;
    // One id slot per input byte — assumes each token consumes at least one
    // byte of input, so this can never be too small. TODO confirm.
    token_ids_.resize(text_length);
  }

  void TearDown(benchmark::State&) override {
    // shrink_to_fit actually releases capacity between benchmark
    // configurations; clear() alone would keep the allocations alive.
    texts_.clear();
    texts_.shrink_to_fit();
    token_ids_.clear();
    token_ids_.shrink_to_fit();
  }

 public:
  // Returns a process-lifetime cached tokenizer for vocab_size, building it on
  // first use. Also used by the batch/streaming fixtures so each vocab size is
  // only built once per process.
  static iree_tokenizer_t* GetCachedTokenizer(size_t vocab_size) {
    for (auto& entry : cache_.entries) {
      if (entry.vocab_size == vocab_size) return entry.tokenizer;
    }
    iree_tokenizer_t* tokenizer = BuildBenchmarkTokenizer(vocab_size);
    cache_.entries.push_back({vocab_size, tokenizer});
    return tokenizer;
  }

 protected:
  iree_tokenizer_t* tokenizer_ = nullptr;  // owned by cache_, not the fixture
  std::vector<std::string> texts_;         // rotating input pool
  size_t text_index_ = 0;                  // next pool slot to encode
  std::vector<iree_tokenizer_token_id_t> token_ids_;  // reusable output buffer

 private:
  struct CacheEntry {
    size_t vocab_size;
    iree_tokenizer_t* tokenizer;
  };
  // Owns all cached tokenizers; frees them at static destruction time.
  struct Cache {
    std::vector<CacheEntry> entries;
    ~Cache() {
      for (auto& entry : entries) {
        iree_tokenizer_free(entry.tokenizer);
      }
    }
  };
  static Cache cache_;
};
EncodeBenchmark::Cache EncodeBenchmark::cache_;
// Single text encode: measures throughput in bytes/s and tokens/s.
// Rotates through a pool of different texts to prevent artificial cache
// warming that would occur from encoding the same text repeatedly.
BENCHMARK_DEFINE_F(EncodeBenchmark, Encode)(benchmark::State& state) {
  for (auto _ : state) {
    const std::string& input = texts_[text_index_];
    if (++text_index_ == kTextPoolSize) text_index_ = 0;
    iree_host_size_t token_count = 0;
    IREE_CHECK_OK(iree_tokenizer_encode(
        tokenizer_, iree_make_string_view(input.data(), input.size()),
        IREE_TOKENIZER_ENCODE_FLAG_NONE,
        iree_tokenizer_make_token_output(token_ids_.data(), NULL, NULL,
                                         token_ids_.size()),
        iree_allocator_system(), &token_count));
    benchmark::DoNotOptimize(token_count);
  }
  // Pool texts are all generated to approximately the same length; use the
  // first as the representative per-iteration byte count.
  state.SetBytesProcessed(state.iterations() *
                          static_cast<int64_t>(texts_[0].size()));
}
// Registration matrix: vocab sizes (1K-128K) × text lengths (100B-100KB).
BENCHMARK_REGISTER_F(EncodeBenchmark, Encode)
    ->ArgNames({"vocab", "text_bytes"})
    // Short text (100B) - tweet-sized.
    ->Args({1000, 100})
    ->Args({10000, 100})
    ->Args({50000, 100})
    ->Args({128000, 100})
    // Medium text (1KB) - paragraph-sized.
    ->Args({1000, 1024})
    ->Args({10000, 1024})
    ->Args({50000, 1024})
    ->Args({128000, 1024})
    // Long text (10KB) - article-sized.
    ->Args({1000, 10240})
    ->Args({10000, 10240})
    ->Args({50000, 10240})
    ->Args({128000, 10240})
    // Very long text (100KB) - document-sized.
    ->Args({10000, 102400})
    ->Args({50000, 102400});
//===----------------------------------------------------------------------===//
// Batch Encode Benchmarks
//===----------------------------------------------------------------------===//
// Fixture for batch encode benchmarks.
// Args: state.range(0) = vocab_size, state.range(1) = batch_size,
// state.range(2) = text_length.
class BatchEncodeBenchmark : public benchmark::Fixture {
 public:
  void SetUp(benchmark::State& state) override {
    size_t vocab_size = static_cast<size_t>(state.range(0));
    batch_size_ = static_cast<size_t>(state.range(1));
    size_t text_length = static_cast<size_t>(state.range(2));
    // Tokenizers are shared with the single-encode fixture's process cache.
    tokenizer_ = EncodeBenchmark::GetCachedTokenizer(vocab_size);
    // Generate different text for each batch item.
    texts_.resize(batch_size_);
    for (size_t i = 0; i < batch_size_; ++i) {
      texts_[i] =
          GetBenchmarkText(static_cast<uint32_t>(0x12340000 + i), text_length);
    }
    // Allocate state storage (reused across batch items).
    iree_host_size_t state_size = 0;
    IREE_CHECK_OK(
        iree_tokenizer_encode_state_calculate_size(tokenizer_, &state_size));
    state_storage_.resize(state_size);
    transform_buffer_.resize(
        iree_tokenizer_transform_buffer_recommended_size(text_length));
    // Allocate output buffers: one id slot per input byte — assumes each
    // token consumes at least one byte of input. TODO confirm.
    all_token_ids_.resize(batch_size_);
    items_.resize(batch_size_);
    for (size_t i = 0; i < batch_size_; ++i) {
      all_token_ids_[i].resize(text_length);
      items_[i].text =
          iree_make_string_view(texts_[i].data(), texts_[i].size());
      items_[i].output = iree_tokenizer_make_token_output(
          all_token_ids_[i].data(), NULL, NULL, all_token_ids_[i].size());
      items_[i].out_token_count = 0;
    }
    // Total input bytes per batch, used for bytes/s reporting.
    bytes_per_batch_ = 0;
    for (const auto& text : texts_) {
      bytes_per_batch_ += text.size();
    }
  }

  void TearDown(benchmark::State&) override {
    // shrink_to_fit releases capacity between benchmark configurations.
    texts_.clear();
    texts_.shrink_to_fit();
    state_storage_.clear();
    state_storage_.shrink_to_fit();
    transform_buffer_.clear();
    transform_buffer_.shrink_to_fit();
    all_token_ids_.clear();
    all_token_ids_.shrink_to_fit();
    items_.clear();
    items_.shrink_to_fit();
  }

 protected:
  iree_tokenizer_t* tokenizer_ = nullptr;  // owned by the shared cache
  size_t batch_size_ = 0;
  std::vector<std::string> texts_;         // one input per batch item
  std::vector<uint8_t> state_storage_;     // reusable encode state storage
  std::vector<uint8_t> transform_buffer_;  // reusable transform scratch
  std::vector<std::vector<iree_tokenizer_token_id_t>> all_token_ids_;
  std::vector<iree_tokenizer_encode_batch_item_t> items_;
  size_t bytes_per_batch_ = 0;  // sum of all batch item text sizes
};
// Batch encode with state reuse: all items run through one encode_batch call
// sharing the same state storage and transform buffer.
BENCHMARK_DEFINE_F(BatchEncodeBenchmark, Batch)(benchmark::State& state) {
  for (auto _ : state) {
    // Reset output counts.
    for (auto& item : items_) {
      item.out_token_count = 0;
    }
    IREE_CHECK_OK(iree_tokenizer_encode_batch(
        tokenizer_, items_.data(), items_.size(),
        IREE_TOKENIZER_ENCODE_FLAG_NONE,
        iree_make_byte_span(state_storage_.data(), state_storage_.size()),
        iree_make_byte_span(transform_buffer_.data(), transform_buffer_.size()),
        iree_tokenizer_offset_run_list_empty()));
    iree_host_size_t total_tokens = 0;
    for (const auto& item : items_) {
      total_tokens += item.out_token_count;
    }
    benchmark::DoNotOptimize(total_tokens);
  }
  state.SetBytesProcessed(state.iterations() *
                          static_cast<int64_t>(bytes_per_batch_));
}
// Registration: few large items vs many small items, at two vocab sizes.
BENCHMARK_REGISTER_F(BatchEncodeBenchmark, Batch)
    ->ArgNames({"vocab", "batch", "text_bytes"})
    ->Args({10000, 8, 1024})
    ->Args({50000, 8, 1024})
    ->Args({10000, 32, 100})
    ->Args({50000, 32, 100})
    ->Args({10000, 128, 100});
//===----------------------------------------------------------------------===//
// Streaming Encode Benchmarks
//===----------------------------------------------------------------------===//
// Fixture for streaming encode benchmarks.
// Args: state.range(0) = vocab_size, state.range(1) = text_length,
// state.range(2) = chunk_size.
//
// Uses a pool of different texts to prevent artificial cache warming.
class StreamingEncodeBenchmark : public benchmark::Fixture {
 public:
  static constexpr size_t kTextPoolSize = 64;

  void SetUp(benchmark::State& state) override {
    size_t vocab_size = static_cast<size_t>(state.range(0));
    size_t text_length = static_cast<size_t>(state.range(1));
    chunk_size_ = static_cast<size_t>(state.range(2));
    // Tokenizers are shared with the single-encode fixture's process cache.
    tokenizer_ = EncodeBenchmark::GetCachedTokenizer(vocab_size);
    // Generate pool of different texts to rotate through.
    texts_.resize(kTextPoolSize);
    for (size_t i = 0; i < kTextPoolSize; ++i) {
      texts_[i] =
          GetBenchmarkText(static_cast<uint32_t>(0x55550000 + i), text_length);
    }
    text_index_ = 0;
    iree_host_size_t state_size = 0;
    IREE_CHECK_OK(
        iree_tokenizer_encode_state_calculate_size(tokenizer_, &state_size));
    state_storage_.resize(state_size);
    // Transform buffer is sized for a single chunk, not the whole text —
    // streaming only ever sees chunk_size_ bytes at a time.
    transform_buffer_.resize(
        iree_tokenizer_transform_buffer_recommended_size(chunk_size_));
    token_ids_.resize(text_length);
  }

  void TearDown(benchmark::State&) override {
    // shrink_to_fit releases capacity between benchmark configurations.
    texts_.clear();
    texts_.shrink_to_fit();
    state_storage_.clear();
    state_storage_.shrink_to_fit();
    transform_buffer_.clear();
    transform_buffer_.shrink_to_fit();
    token_ids_.clear();
    token_ids_.shrink_to_fit();
  }

 protected:
  iree_tokenizer_t* tokenizer_ = nullptr;  // owned by the shared cache
  size_t chunk_size_ = 0;                  // bytes offered per feed() call
  std::vector<std::string> texts_;         // rotating input pool
  size_t text_index_ = 0;
  std::vector<uint8_t> state_storage_;     // reusable streaming state storage
  std::vector<uint8_t> transform_buffer_;  // per-chunk transform scratch
  std::vector<iree_tokenizer_token_id_t> token_ids_;  // output buffer
};
// Rotates through a pool of different texts to prevent artificial cache
// warming.
BENCHMARK_DEFINE_F(StreamingEncodeBenchmark, Stream)
(benchmark::State& state) {
  for (auto _ : state) {
    const std::string& text = texts_[text_index_];
    text_index_ = (text_index_ + 1) % kTextPoolSize;
    // Fresh streaming state per iteration, constructed in the reusable
    // storage sized by iree_tokenizer_encode_state_calculate_size() in
    // SetUp().
    iree_tokenizer_encode_state_t* encode_state = nullptr;
    IREE_CHECK_OK(iree_tokenizer_encode_state_initialize(
        tokenizer_,
        iree_make_byte_span(state_storage_.data(), state_storage_.size()),
        iree_make_byte_span(transform_buffer_.data(), transform_buffer_.size()),
        iree_tokenizer_offset_run_list_empty(),
        IREE_TOKENIZER_ENCODE_FLAG_AT_INPUT_START, &encode_state));
    // Feed text in chunks.
    iree_host_size_t total_tokens = 0;
    iree_string_view_t remaining =
        iree_make_string_view(text.data(), text.size());
    while (remaining.size > 0) {
      size_t this_chunk =
          remaining.size < chunk_size_ ? remaining.size : chunk_size_;
      iree_string_view_t chunk =
          iree_make_string_view(remaining.data, this_chunk);
      iree_host_size_t bytes_consumed = 0;
      iree_host_size_t tokens_written = 0;
      // feed() reports how much it actually consumed; advance by
      // bytes_consumed rather than this_chunk so partially-consumed chunks
      // are re-offered.
      IREE_CHECK_OK(iree_tokenizer_encode_state_feed(
          encode_state, chunk,
          iree_tokenizer_make_token_output(token_ids_.data() + total_tokens,
                                           NULL, NULL,
                                           token_ids_.size() - total_tokens),
          &bytes_consumed, &tokens_written));
      total_tokens += tokens_written;
      remaining.data += bytes_consumed;
      remaining.size -= bytes_consumed;
    }
    // Finalize emits any tokens still pending in the streaming state.
    iree_host_size_t final_tokens = 0;
    IREE_CHECK_OK(iree_tokenizer_encode_state_finalize(
        encode_state,
        iree_tokenizer_make_token_output(token_ids_.data() + total_tokens, NULL,
                                         NULL,
                                         token_ids_.size() - total_tokens),
        &final_tokens));
    iree_tokenizer_encode_state_deinitialize(encode_state);
    benchmark::DoNotOptimize(total_tokens + final_tokens);
  }
  // Pool texts can differ slightly in length; texts_[0].size() is used as the
  // representative per-iteration byte count.
  state.SetBytesProcessed(state.iterations() *
                          static_cast<int64_t>(texts_[0].size()));
}
// Registration: chunk-size sweep plus a 100KB case comparable to batch.
BENCHMARK_REGISTER_F(StreamingEncodeBenchmark, Stream)
    ->ArgNames({"vocab", "text_bytes", "chunk_bytes"})
    // Various chunk sizes on 10KB text.
    ->Args({10000, 10240, 64})
    ->Args({10000, 10240, 256})
    ->Args({10000, 10240, 1024})
    ->Args({10000, 10240, 4096})
    // Streaming vs batch comparison on 100KB.
    ->Args({10000, 102400, 1024});
//===----------------------------------------------------------------------===//
// Cold Start Benchmarks
//===----------------------------------------------------------------------===//
// Measures full pipeline: tokenizer construction + encode.
// No fixture caching — intentionally rebuilds each iteration.
// Args: state.range(0) = vocab_size, state.range(1) = text_length.
void BM_ColdStart(benchmark::State& state) {
  size_t vocab_size = static_cast<size_t>(state.range(0));
  size_t text_length = static_cast<size_t>(state.range(1));
  std::string text = GetBenchmarkText(0xDEADBEEF, text_length);
  // One id slot per input byte bounds the token count.
  std::vector<iree_tokenizer_token_id_t> token_ids(text_length);
  for (auto _ : state) {
    // Build tokenizer (not cached — measuring cold start).
    iree_tokenizer_t* tokenizer = BuildBenchmarkTokenizer(vocab_size);
    iree_host_size_t token_count = 0;
    // CHECK instead of silently ignoring the status so a failing encode
    // cannot quietly turn this into a construction-only benchmark; the other
    // benchmarks CHECK the identical call with identically-sized buffers.
    IREE_CHECK_OK(iree_tokenizer_encode(
        tokenizer, iree_make_string_view(text.data(), text.size()),
        IREE_TOKENIZER_ENCODE_FLAG_NONE,
        iree_tokenizer_make_token_output(token_ids.data(), NULL, NULL,
                                         token_ids.size()),
        iree_allocator_system(), &token_count));
    benchmark::DoNotOptimize(token_count);
    iree_tokenizer_free(tokenizer);
  }
}
// Registration: vocab-size sweep on paragraph-sized (1KB) text.
BENCHMARK(BM_ColdStart)
    ->ArgNames({"vocab", "text_bytes"})
    ->Args({1000, 1024})
    ->Args({10000, 1024})
    ->Args({50000, 1024})
    ->Args({128000, 1024});
//===----------------------------------------------------------------------===//
// Decode Benchmarks (Pre-decoded Fast Path)
//===----------------------------------------------------------------------===//
// Fixture for decode benchmarks. Caches tokenizers with ByteLevel decoder.
// Args: state.range(0) = vocab_size, state.range(1) = text_length.
//
// Uses a pool of different token sequences to prevent artificial cache warming
// from decoding the same tokens repeatedly. Each iteration uses different
// tokens.
class DecodeBenchmark : public benchmark::Fixture {
 public:
  static constexpr size_t kTokenPoolSize = 64;

  void SetUp(benchmark::State& state) override {
    size_t vocab_size = static_cast<size_t>(state.range(0));
    size_t text_length = static_cast<size_t>(state.range(1));
    tokenizer_ = GetCachedDecodeTokenizer(vocab_size);
    // Generate pool of different token sequences to rotate through, produced
    // by encoding generated text with the same tokenizer.
    token_sequences_.resize(kTokenPoolSize);
    size_t max_tokens = 0;
    for (size_t i = 0; i < kTokenPoolSize; ++i) {
      std::string text =
          GetBenchmarkText(static_cast<uint32_t>(0xAAAA0000 + i), text_length);
      std::vector<iree_tokenizer_token_id_t> encode_ids(text_length);
      iree_host_size_t token_count = 0;
      IREE_CHECK_OK(iree_tokenizer_encode(
          tokenizer_, iree_make_string_view(text.data(), text.size()),
          IREE_TOKENIZER_ENCODE_FLAG_NONE,
          iree_tokenizer_make_token_output(encode_ids.data(), NULL, NULL,
                                           encode_ids.size()),
          iree_allocator_system(), &token_count));
      token_sequences_[i].assign(encode_ids.begin(),
                                 encode_ids.begin() + token_count);
      // NOTE(review): std::max requires <algorithm>, which is not in this
      // file's include list — presumably pulled in transitively; confirm.
      max_tokens = std::max(max_tokens, static_cast<size_t>(token_count));
    }
    token_index_ = 0;
    // Allocate decode state storage.
    iree_host_size_t state_size = 0;
    IREE_CHECK_OK(
        iree_tokenizer_decode_state_calculate_size(tokenizer_, &state_size));
    state_storage_.resize(state_size);
    // Allocate output buffer (generous — max ~16x token count for multi-byte).
    output_.resize(max_tokens * 16 + 256);
  }

  void TearDown(benchmark::State&) override {
    // shrink_to_fit releases capacity between benchmark configurations.
    token_sequences_.clear();
    token_sequences_.shrink_to_fit();
    state_storage_.clear();
    state_storage_.shrink_to_fit();
    output_.clear();
    output_.shrink_to_fit();
  }

 public:
  // Returns a process-lifetime cached decoder-enabled tokenizer for
  // vocab_size, building it on first use. Also used by the streaming decode
  // fixture. Kept separate from EncodeBenchmark's cache because these
  // tokenizers carry a ByteLevel decoder.
  static iree_tokenizer_t* GetCachedDecodeTokenizer(size_t vocab_size) {
    for (auto& entry : cache_.entries) {
      if (entry.vocab_size == vocab_size) return entry.tokenizer;
    }
    iree_tokenizer_t* tokenizer =
        BuildBenchmarkTokenizerWithDecoder(vocab_size);
    cache_.entries.push_back({vocab_size, tokenizer});
    return tokenizer;
  }

 protected:
  iree_tokenizer_t* tokenizer_ = nullptr;  // owned by cache_, not the fixture
  std::vector<std::vector<iree_tokenizer_token_id_t>> token_sequences_;
  size_t token_index_ = 0;                 // next pool slot to decode
  std::vector<uint8_t> state_storage_;     // reusable decode state storage
  std::vector<char> output_;               // reusable decoded-text buffer

 private:
  struct CacheEntry {
    size_t vocab_size;
    iree_tokenizer_t* tokenizer;
  };
  // Owns all cached tokenizers; frees them at static destruction time.
  struct Cache {
    std::vector<CacheEntry> entries;
    ~Cache() {
      for (auto& entry : entries) {
        iree_tokenizer_free(entry.tokenizer);
      }
    }
  };
  static Cache cache_;
};
DecodeBenchmark::Cache DecodeBenchmark::cache_;
// Decodes a full token sequence per iteration via the pre-decoded fast path.
// Rotates through a pool of different token sequences to prevent artificial
// cache warming that would occur from decoding the same tokens repeatedly.
BENCHMARK_DEFINE_F(DecodeBenchmark, Decode)(benchmark::State& state) {
  int64_t total_decoded_bytes = 0;  // text bytes produced across iterations
  for (auto _ : state) {
    const auto& tokens = token_sequences_[token_index_];
    token_index_ = (token_index_ + 1) % kTokenPoolSize;
    iree_tokenizer_token_id_list_t id_list = {
        /*.count=*/tokens.size(),
        /*.values=*/reinterpret_cast<const int32_t*>(tokens.data()),
    };
    iree_tokenizer_decode_state_t* decode_state = nullptr;
    IREE_CHECK_OK(iree_tokenizer_decode_state_initialize(
        tokenizer_, IREE_TOKENIZER_DECODE_FLAG_NONE,
        iree_make_byte_span(state_storage_.data(), state_storage_.size()),
        &decode_state));
    iree_host_size_t tokens_consumed = 0;
    iree_host_size_t text_length_out = 0;
    // Status intentionally ignored (not CHECKed): output_ is sized
    // generously in SetUp so truncation is not expected.
    iree_status_t status = iree_tokenizer_decode_state_feed(
        decode_state, id_list,
        iree_make_mutable_string_view(output_.data(), output_.size()),
        &tokens_consumed, &text_length_out);
    iree_status_ignore(status);
    iree_host_size_t finalize_length = 0;
    status = iree_tokenizer_decode_state_finalize(
        decode_state,
        iree_make_mutable_string_view(output_.data() + text_length_out,
                                      output_.size() - text_length_out),
        &finalize_length);
    iree_status_ignore(status);
    iree_tokenizer_decode_state_deinitialize(decode_state);
    benchmark::DoNotOptimize(text_length_out + finalize_length);
    total_decoded_bytes +=
        static_cast<int64_t>(text_length_out + finalize_length);
  }
  // Report bytes/s as decoded text bytes produced. (Previously this passed
  // the token count, which made bytes/s a duplicate of items/s.)
  state.SetBytesProcessed(total_decoded_bytes);
  // items/s = tokens decoded per second.
  state.SetItemsProcessed(state.iterations() *
                          static_cast<int64_t>(token_sequences_[0].size()));
}
// Registration: vocab sizes × source-text lengths (token counts scale ~1:4
// with text bytes for this vocabulary).
BENCHMARK_REGISTER_F(DecodeBenchmark, Decode)
    ->ArgNames({"vocab", "text_bytes"})
    // Short token sequence (~25 tokens from 100B text).
    ->Args({1000, 100})
    ->Args({10000, 100})
    ->Args({50000, 100})
    // Medium token sequence (~250 tokens from 1KB text).
    ->Args({1000, 1024})
    ->Args({10000, 1024})
    ->Args({50000, 1024})
    // Long token sequence (~2500 tokens from 10KB text).
    ->Args({10000, 10240})
    ->Args({50000, 10240})
    // Very long token sequence (~25000 tokens from 100KB text).
    ->Args({10000, 102400})
    ->Args({50000, 102400});
//===----------------------------------------------------------------------===//
// Streaming Decode Benchmarks
//===----------------------------------------------------------------------===//
// Fixture for streaming decode benchmarks. Feeds tokens in small chunks,
// simulating autoregressive LLM inference where tokens arrive one at a time.
// Args: state.range(0) = vocab_size, state.range(1) = text_length,
// state.range(2) = tokens_per_call.
//
// Uses a pool of different token sequences to prevent artificial cache warming.
class StreamingDecodeBenchmark : public benchmark::Fixture {
 public:
  static constexpr size_t kTokenPoolSize = 64;

  void SetUp(benchmark::State& state) override {
    size_t vocab_size = static_cast<size_t>(state.range(0));
    size_t text_length = static_cast<size_t>(state.range(1));
    tokens_per_call_ = static_cast<size_t>(state.range(2));
    // Decoder-enabled tokenizers are shared with DecodeBenchmark's cache.
    tokenizer_ = DecodeBenchmark::GetCachedDecodeTokenizer(vocab_size);
    // Generate pool of different token sequences to rotate through, produced
    // by encoding generated text with the same tokenizer.
    token_sequences_.resize(kTokenPoolSize);
    size_t max_tokens = 0;
    for (size_t i = 0; i < kTokenPoolSize; ++i) {
      std::string text =
          GetBenchmarkText(static_cast<uint32_t>(0xBBBB0000 + i), text_length);
      std::vector<iree_tokenizer_token_id_t> encode_ids(text_length);
      iree_host_size_t token_count = 0;
      IREE_CHECK_OK(iree_tokenizer_encode(
          tokenizer_, iree_make_string_view(text.data(), text.size()),
          IREE_TOKENIZER_ENCODE_FLAG_NONE,
          iree_tokenizer_make_token_output(encode_ids.data(), NULL, NULL,
                                           encode_ids.size()),
          iree_allocator_system(), &token_count));
      token_sequences_[i].assign(encode_ids.begin(),
                                 encode_ids.begin() + token_count);
      max_tokens = std::max(max_tokens, static_cast<size_t>(token_count));
    }
    token_index_ = 0;
    // Allocate decode state storage.
    iree_host_size_t state_size = 0;
    IREE_CHECK_OK(
        iree_tokenizer_decode_state_calculate_size(tokenizer_, &state_size));
    state_storage_.resize(state_size);
    // Allocate output buffer (generous — ~16x token count, as in
    // DecodeBenchmark).
    output_.resize(max_tokens * 16 + 256);
  }

  void TearDown(benchmark::State&) override {
    // shrink_to_fit releases capacity between benchmark configurations.
    token_sequences_.clear();
    token_sequences_.shrink_to_fit();
    state_storage_.clear();
    state_storage_.shrink_to_fit();
    output_.clear();
    output_.shrink_to_fit();
  }

 protected:
  iree_tokenizer_t* tokenizer_ = nullptr;  // owned by the shared cache
  size_t tokens_per_call_ = 1;             // tokens offered per feed() call
  std::vector<std::vector<iree_tokenizer_token_id_t>> token_sequences_;
  size_t token_index_ = 0;                 // next pool slot to decode
  std::vector<uint8_t> state_storage_;     // reusable decode state storage
  std::vector<char> output_;               // reusable decoded-text buffer
};
// Feeds tokens in chunks of tokens_per_call_, simulating autoregressive
// decoding. Rotates through a pool of different token sequences to prevent
// artificial cache warming from decoding the same tokens repeatedly.
BENCHMARK_DEFINE_F(StreamingDecodeBenchmark, Stream)
(benchmark::State& state) {
  int64_t total_decoded_bytes = 0;  // text bytes produced across iterations
  for (auto _ : state) {
    const auto& tokens = token_sequences_[token_index_];
    token_index_ = (token_index_ + 1) % kTokenPoolSize;
    iree_tokenizer_decode_state_t* decode_state = nullptr;
    IREE_CHECK_OK(iree_tokenizer_decode_state_initialize(
        tokenizer_, IREE_TOKENIZER_DECODE_FLAG_NONE,
        iree_make_byte_span(state_storage_.data(), state_storage_.size()),
        &decode_state));
    iree_host_size_t total_text = 0;
    size_t token_position = 0;
    while (token_position < tokens.size()) {
      size_t chunk = tokens.size() - token_position;
      if (chunk > tokens_per_call_) chunk = tokens_per_call_;
      iree_tokenizer_token_id_list_t id_list = {
          /*.count=*/chunk,
          /*.values=*/
          reinterpret_cast<const int32_t*>(tokens.data() + token_position),
      };
      iree_host_size_t tokens_consumed = 0;
      iree_host_size_t text_written = 0;
      // Status intentionally ignored: output_ is sized generously in SetUp.
      iree_status_t status = iree_tokenizer_decode_state_feed(
          decode_state, id_list,
          iree_make_mutable_string_view(output_.data() + total_text,
                                        output_.size() - total_text),
          &tokens_consumed, &text_written);
      iree_status_ignore(status);
      token_position += tokens_consumed;
      total_text += text_written;
      // Avoid infinite loop if no progress.
      if (tokens_consumed == 0) break;
    }
    iree_host_size_t finalize_length = 0;
    iree_status_t status = iree_tokenizer_decode_state_finalize(
        decode_state,
        iree_make_mutable_string_view(output_.data() + total_text,
                                      output_.size() - total_text),
        &finalize_length);
    iree_status_ignore(status);
    iree_tokenizer_decode_state_deinitialize(decode_state);
    benchmark::DoNotOptimize(total_text + finalize_length);
    total_decoded_bytes += static_cast<int64_t>(total_text + finalize_length);
  }
  // Report bytes/s as decoded text bytes produced. (Previously this passed
  // the token count, which made bytes/s a duplicate of items/s.)
  state.SetBytesProcessed(total_decoded_bytes);
  // items/s = tokens decoded per second.
  state.SetItemsProcessed(state.iterations() *
                          static_cast<int64_t>(token_sequences_[0].size()));
}
// Registration: tokens_per_call sweep from autoregressive (1) up to
// effectively full-batch (10000) on ~2500-token sequences.
BENCHMARK_REGISTER_F(StreamingDecodeBenchmark, Stream)
    ->ArgNames({"vocab", "text_bytes", "tokens_per_call"})
    // Autoregressive: 1 token at a time (LLM inference hot path).
    ->Args({10000, 10240, 1})
    ->Args({50000, 10240, 1})
    // Small batch: speculative decoding or beam search.
    ->Args({10000, 10240, 10})
    ->Args({50000, 10240, 10})
    // Medium batch.
    ->Args({10000, 10240, 100})
    ->Args({50000, 10240, 100})
    // Large batch (comparable to full-batch Decode benchmark).
    ->Args({10000, 10240, 10000})
    ->Args({50000, 10240, 10000});
} // namespace