| // Copyright 2026 The IREE Authors |
| // |
| // Licensed under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| |
| // Tokenizes text using HuggingFace tokenizer.json files. |
| // |
| // Example encoding text to token IDs (default: comma-separated): |
| // iree-tokenize --tokenizer=tokenizer.json "Hello, world!" |
| // # Output: 101,7592,1010,2088,999,102 |
| // |
| // Example JSON output: |
| // iree-tokenize --tokenizer=tokenizer.json --json "Hello, world!" |
| // # Output: {"ids":[101,7592,1010,2088,999,102]} |
| // |
| // Example encoding without special tokens: |
| // iree-tokenize --tokenizer=tokenizer.json --special=false "Hello, world!" |
| // # Output: 7592,1010,2088,999 |
| // |
| // Example with offset tracking: |
| // iree-tokenize --tokenizer=tokenizer.json --offsets "Hello, world!" |
| // # Output: 7592[0:5],1010[5:6],2088[7:12],999[12:13] |
| // |
| // Example decoding token IDs to text: |
| // iree-tokenize --tokenizer=tokenizer.json --decode |
| // "101,7592,1010,2088,999,102" |
| // # Output: Hello, world! |
| // |
| // Example batch mode (one line per input): |
| // echo -e "Hello\nWorld" | iree-tokenize --tokenizer=tokenizer.json --batch |
| // # Output: 101,7592,... |
| // # 101,2088,... |
| // |
| // Example showing tokenizer info (always JSON): |
| // iree-tokenize --tokenizer=tokenizer.json --info |
| // # Output: {"vocab_size":30522,"model_type":"BPE",...} |
| // |
| // Example benchmarking: |
| // iree-tokenize --tokenizer=tokenizer.json --benchmark=oneshot "Hello, |
| // world!" # Output: timing stats to stderr, token IDs to stdout |
| |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <string.h> |
| |
| #include "iree/base/api.h" |
| #include "iree/base/internal/json.h" |
| #include "iree/base/tooling/flags.h" |
| #include "iree/io/file_contents.h" |
| #include "iree/tokenizer/format/huggingface/tokenizer_json.h" |
| #include "iree/tokenizer/format/tiktoken/tiktoken.h" |
| #include "iree/tokenizer/tokenizer.h" |
| #include "iree/tokenizer/vocab/vocab.h" |
| |
| //===----------------------------------------------------------------------===// |
| // Flags |
| //===----------------------------------------------------------------------===// |
| |
| IREE_FLAG(bool, decode, false, "Decode mode: input is comma-separated IDs."); |
| IREE_FLAG(bool, decode_special, false, |
| "Include special tokens (BOS/EOS) in decode output."); |
| IREE_FLAG(bool, special, true, "Add special tokens (BOS/EOS, CLS/SEP)."); |
| IREE_FLAG(bool, batch, false, "Batch mode: read lines from stdin."); |
| IREE_FLAG(bool, stream, false, "Stream stdin continuously (not line-by-line)."); |
| IREE_FLAG(int32_t, max_length, 0, "Max output length (0 = unlimited)."); |
| IREE_FLAG(bool, info, false, "Show tokenizer info instead of encoding."); |
| IREE_FLAG(bool, json, false, |
| "Output JSON format (default: comma-separated IDs)."); |
| IREE_FLAG(bool, json_string, false, |
| "Input is a JSON-encoded string (handles \\uXXXX escapes)."); |
| IREE_FLAG(string, tokenizer, "", |
| "Path to tokenizer file (.json for HuggingFace, .tiktoken for " |
| "OpenAI tiktoken format)."); |
| IREE_FLAG(string, encoding, "", |
| "Tiktoken encoding name (cl100k_base, o200k_base, r50k_base, " |
| "p50k_base). Required for .tiktoken files; ignored for .json."); |
| IREE_FLAG(bool, offsets, false, "Show token-to-byte offset mappings."); |
| IREE_FLAG(string, benchmark, "", |
| "Benchmark mode: oneshot, batch, stream, or decode."); |
| IREE_FLAG(int32_t, benchmark_iterations, 100, |
| "Number of timed iterations for benchmarking."); |
| IREE_FLAG(int32_t, benchmark_warmup, 5, |
| "Number of warmup iterations before timing."); |
| IREE_FLAG(int32_t, benchmark_chunk_size, 4096, |
| "Chunk size in bytes for stream benchmark."); |
| //===----------------------------------------------------------------------===// |
| // Output Helpers |
| //===----------------------------------------------------------------------===// |
| |
| // Prints a token ID with optional offset annotation. |
| static void iree_tooling_print_token(iree_tokenizer_token_id_t token_id, |
| const iree_tokenizer_offset_t* offset, |
| bool first) { |
| if (!first) fputc(',', stdout); |
| fprintf(stdout, "%" PRId32, token_id); |
| if (offset) { |
| fprintf(stdout, "[%zu:%zu]", (size_t)offset->start, (size_t)offset->end); |
| } |
| } |
| |
| // Prints a token sequence with optional offsets. |
| static void iree_tooling_print_tokens( |
| const iree_tokenizer_token_id_t* token_ids, |
| const iree_tokenizer_offset_t* offsets, iree_host_size_t count, |
| bool* first) { |
| for (iree_host_size_t i = 0; i < count; ++i) { |
| iree_tooling_print_token(token_ids[i], offsets ? &offsets[i] : NULL, |
| *first); |
| *first = false; |
| } |
| } |
| |
| // Prints a JSON array of token IDs with optional offsets. |
| static void iree_tooling_print_json_tokens( |
| const iree_tokenizer_token_id_t* token_ids, |
| const iree_tokenizer_offset_t* offsets, iree_host_size_t count) { |
| fputs("{\"ids\":[", stdout); |
| for (iree_host_size_t i = 0; i < count; ++i) { |
| if (i > 0) fputc(',', stdout); |
| fprintf(stdout, "%" PRId32, token_ids[i]); |
| } |
| fputc(']', stdout); |
| if (offsets) { |
| fputs(",\"offsets\":[", stdout); |
| for (iree_host_size_t i = 0; i < count; ++i) { |
| if (i > 0) fputc(',', stdout); |
| fprintf(stdout, "[%zu,%zu]", (size_t)offsets[i].start, |
| (size_t)offsets[i].end); |
| } |
| fputc(']', stdout); |
| } |
| fputs("}\n", stdout); |
| } |
| |
| // Writes decoded text to stdout with JSON escaping if needed. |
| static void iree_tooling_print_text(const char* data, iree_host_size_t length, |
| bool json_escape) { |
| if (!json_escape) { |
| fwrite(data, 1, length, stdout); |
| return; |
| } |
| for (iree_host_size_t i = 0; i < length; ++i) { |
| char c = data[i]; |
| switch (c) { |
| case '"': |
| fputs("\\\"", stdout); |
| break; |
| case '\\': |
| fputs("\\\\", stdout); |
| break; |
| case '\b': |
| fputs("\\b", stdout); |
| break; |
| case '\f': |
| fputs("\\f", stdout); |
| break; |
| case '\n': |
| fputs("\\n", stdout); |
| break; |
| case '\r': |
| fputs("\\r", stdout); |
| break; |
| case '\t': |
| fputs("\\t", stdout); |
| break; |
| default: |
| if ((unsigned char)c < 0x20) { |
| fprintf(stdout, "\\u%04x", (unsigned char)c); |
| } else { |
| fputc(c, stdout); |
| } |
| break; |
| } |
| } |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // Encode (one-shot with retry) |
| //===----------------------------------------------------------------------===// |
| |
| // Builds encode flags from CLI flags. |
| static iree_tokenizer_encode_flags_t iree_tooling_encode_flags(void) { |
| iree_tokenizer_encode_flags_t flags = |
| IREE_TOKENIZER_ENCODE_FLAG_AT_INPUT_START; |
| if (FLAG_special) flags |= IREE_TOKENIZER_ENCODE_FLAG_ADD_SPECIAL_TOKENS; |
| if (FLAG_offsets) flags |= IREE_TOKENIZER_ENCODE_FLAG_TRACK_OFFSETS; |
| return flags; |
| } |
| |
| static iree_status_t iree_tooling_tokenize_encode( |
| const iree_tokenizer_t* tokenizer, iree_string_view_t text, |
| iree_allocator_t allocator) { |
| IREE_TRACE_ZONE_BEGIN(z0); |
| |
| iree_tokenizer_encode_flags_t flags = iree_tooling_encode_flags(); |
| |
| // Allocate combined output buffer. Use fixed capacity - the streaming encode |
| // API operates in bounded memory and should never need retries. |
| iree_host_size_t capacity = 8192; |
| iree_host_size_t total_size = 0; |
| iree_host_size_t token_ids_offset = 0; |
| iree_host_size_t offsets_offset = 0; |
| IREE_RETURN_AND_END_ZONE_IF_ERROR( |
| z0, IREE_STRUCT_LAYOUT( |
| 0, &total_size, |
| IREE_STRUCT_FIELD(capacity, iree_tokenizer_token_id_t, |
| &token_ids_offset), |
| IREE_STRUCT_FIELD(FLAG_offsets ? capacity : 0, |
| iree_tokenizer_offset_t, &offsets_offset))); |
| uint8_t* storage = NULL; |
| IREE_RETURN_AND_END_ZONE_IF_ERROR( |
| z0, iree_allocator_malloc(allocator, total_size, (void**)&storage)); |
| |
| iree_tokenizer_token_id_t* token_ids = |
| (iree_tokenizer_token_id_t*)(storage + token_ids_offset); |
| iree_tokenizer_offset_t* offsets = |
| FLAG_offsets ? (iree_tokenizer_offset_t*)(storage + offsets_offset) |
| : NULL; |
| iree_host_size_t token_count = 0; |
| |
| iree_tokenizer_token_output_t output = |
| iree_tokenizer_make_token_output(token_ids, offsets, NULL, capacity); |
| iree_status_t status = iree_tokenizer_encode(tokenizer, text, flags, output, |
| allocator, &token_count); |
| |
| if (iree_status_is_ok(status)) { |
| // Apply max_length truncation. |
| if (FLAG_max_length > 0 && |
| token_count > (iree_host_size_t)FLAG_max_length) { |
| token_count = (iree_host_size_t)FLAG_max_length; |
| } |
| |
| if (FLAG_json) { |
| iree_tooling_print_json_tokens(token_ids, offsets, token_count); |
| } else { |
| bool first = true; |
| iree_tooling_print_tokens(token_ids, offsets, token_count, &first); |
| fputc('\n', stdout); |
| } |
| } |
| |
| iree_allocator_free(allocator, storage); |
| |
| IREE_TRACE_ZONE_END(z0); |
| return status; |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // Decode (one-shot with retry) |
| //===----------------------------------------------------------------------===// |
| |
| // Parses comma-separated IDs from a string. |
| static iree_status_t iree_tooling_parse_ids(iree_string_view_t text, |
| iree_tokenizer_token_id_t* out_ids, |
| iree_host_size_t max_ids, |
| iree_host_size_t* out_count) { |
| *out_count = 0; |
| if (text.size == 0) return iree_ok_status(); |
| |
| iree_host_size_t position = 0; |
| while (position < text.size) { |
| // Skip whitespace. |
| while (position < text.size && |
| (text.data[position] == ' ' || text.data[position] == '\t')) { |
| ++position; |
| } |
| if (position >= text.size) break; |
| |
| // Parse number. |
| bool negative = false; |
| if (text.data[position] == '-') { |
| negative = true; |
| ++position; |
| } |
| int32_t value = 0; |
| bool found_digit = false; |
| while (position < text.size && text.data[position] >= '0' && |
| text.data[position] <= '9') { |
| value = value * 10 + (text.data[position] - '0'); |
| found_digit = true; |
| ++position; |
| } |
| if (!found_digit) { |
| return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, |
| "expected number at position %zu", position); |
| } |
| if (negative) value = -value; |
| |
| if (*out_count >= max_ids) { |
| return iree_make_status(IREE_STATUS_RESOURCE_EXHAUSTED, |
| "too many IDs (max %zu)", max_ids); |
| } |
| out_ids[(*out_count)++] = value; |
| |
| // Skip comma. |
| while (position < text.size && |
| (text.data[position] == ' ' || text.data[position] == '\t')) { |
| ++position; |
| } |
| if (position < text.size && text.data[position] == ',') { |
| ++position; |
| } |
| } |
| return iree_ok_status(); |
| } |
| |
| static iree_status_t iree_tooling_tokenize_decode( |
| const iree_tokenizer_t* tokenizer, iree_string_view_t input, |
| iree_allocator_t allocator) { |
| IREE_TRACE_ZONE_BEGIN(z0); |
| |
| // Parse IDs into stack buffer. |
| iree_tokenizer_token_id_t ids[8192]; |
| iree_host_size_t id_count = 0; |
| IREE_RETURN_AND_END_ZONE_IF_ERROR( |
| z0, iree_tooling_parse_ids(input, ids, IREE_ARRAYSIZE(ids), &id_count)); |
| |
| iree_tokenizer_token_id_list_t tokens = |
| iree_tokenizer_make_token_id_list(ids, id_count); |
| |
| // Decode with retry on RESOURCE_EXHAUSTED. |
| iree_host_size_t text_capacity = 65536; |
| char* text_buffer = NULL; |
| iree_host_size_t text_length = 0; |
| iree_status_t status = iree_ok_status(); |
| |
| for (;;) { |
| status = |
| iree_allocator_malloc(allocator, text_capacity, (void**)&text_buffer); |
| if (!iree_status_is_ok(status)) break; |
| |
| iree_mutable_string_view_t text_output = {text_buffer, text_capacity}; |
| iree_tokenizer_decode_flags_t decode_flags = |
| FLAG_decode_special ? IREE_TOKENIZER_DECODE_FLAG_NONE |
| : IREE_TOKENIZER_DECODE_FLAG_SKIP_SPECIAL_TOKENS; |
| status = iree_tokenizer_decode(tokenizer, tokens, decode_flags, text_output, |
| allocator, &text_length); |
| if (iree_status_is_resource_exhausted(status)) { |
| iree_status_ignore(status); |
| iree_allocator_free(allocator, text_buffer); |
| text_buffer = NULL; |
| text_capacity *= 2; |
| continue; |
| } |
| break; |
| } |
| |
| if (iree_status_is_ok(status)) { |
| if (FLAG_json) { |
| fputs("{\"text\":\"", stdout); |
| iree_tooling_print_text(text_buffer, text_length, /*json_escape=*/true); |
| fputs("\"}\n", stdout); |
| } else { |
| iree_tooling_print_text(text_buffer, text_length, /*json_escape=*/false); |
| fputc('\n', stdout); |
| } |
| } |
| |
| iree_allocator_free(allocator, text_buffer); |
| IREE_TRACE_ZONE_END(z0); |
| return status; |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // Info |
| //===----------------------------------------------------------------------===// |
| |
| static iree_status_t iree_tooling_tokenize_info( |
| const iree_tokenizer_t* tokenizer) { |
| IREE_TRACE_ZONE_BEGIN(z0); |
| |
| const iree_tokenizer_vocab_t* vocab = iree_tokenizer_vocab(tokenizer); |
| iree_host_size_t vocab_size = iree_tokenizer_vocab_capacity(vocab); |
| iree_host_size_t merge_count = iree_tokenizer_vocab_merge_count(vocab); |
| iree_tokenizer_special_ids_t special = |
| iree_tokenizer_vocab_special_ids(vocab); |
| iree_string_view_t model_type = iree_tokenizer_model_type_name(tokenizer); |
| |
| fprintf(stdout, "{\"vocab_size\":%zu,\"model_type\":\"%.*s\"", |
| (size_t)vocab_size, (int)model_type.size, model_type.data); |
| |
| if (merge_count > 0) { |
| fprintf(stdout, ",\"merge_count\":%zu", (size_t)merge_count); |
| } |
| |
| // Special tokens. |
| if (special.bos >= 0) fprintf(stdout, ",\"bos_id\":%" PRId32, special.bos); |
| if (special.eos >= 0) fprintf(stdout, ",\"eos_id\":%" PRId32, special.eos); |
| if (special.unk >= 0) fprintf(stdout, ",\"unk_id\":%" PRId32, special.unk); |
| if (special.pad >= 0) fprintf(stdout, ",\"pad_id\":%" PRId32, special.pad); |
| if (special.cls >= 0) fprintf(stdout, ",\"cls_id\":%" PRId32, special.cls); |
| if (special.sep >= 0) fprintf(stdout, ",\"sep_id\":%" PRId32, special.sep); |
| if (special.mask >= 0) fprintf(stdout, ",\"mask_id\":%" PRId32, special.mask); |
| |
| fputs("}\n", stdout); |
| |
| IREE_TRACE_ZONE_END(z0); |
| return iree_ok_status(); |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // Streaming Stdin Mode |
| //===----------------------------------------------------------------------===// |
| |
| // Streams stdin, reading chunks and emitting tokens incrementally. |
| // Uses the pull-based streaming encode API which handles all boundary |
| // conditions: |
| // - Incomplete UTF-8 sequences at chunk boundaries |
| // - Literals (added_tokens) that span chunks |
| // - Transform segments that span chunks |
| // - BOS/EOS token emission (via postprocessor) |
| static iree_status_t iree_tooling_tokenize_stdin_streaming( |
| const iree_tokenizer_t* tokenizer, iree_allocator_t allocator) { |
| IREE_TRACE_ZONE_BEGIN(z0); |
| |
| iree_tokenizer_encode_flags_t flags = iree_tooling_encode_flags(); |
| |
| // Calculate state storage requirements. |
| iree_host_size_t state_size = 0; |
| IREE_RETURN_AND_END_ZONE_IF_ERROR( |
| z0, iree_tokenizer_encode_state_calculate_size(tokenizer, &state_size)); |
| |
| // Allocate combined state and transform buffer. |
| iree_host_size_t transform_size = |
| iree_tokenizer_transform_buffer_recommended_size(8192); |
| iree_host_size_t total_size = 0; |
| iree_host_size_t state_offset = 0; |
| iree_host_size_t transform_offset = 0; |
| IREE_RETURN_AND_END_ZONE_IF_ERROR( |
| z0, |
| IREE_STRUCT_LAYOUT( |
| 0, &total_size, IREE_STRUCT_FIELD(state_size, uint8_t, &state_offset), |
| IREE_STRUCT_FIELD(transform_size, uint8_t, &transform_offset))); |
| uint8_t* storage = NULL; |
| IREE_RETURN_AND_END_ZONE_IF_ERROR( |
| z0, iree_allocator_malloc(allocator, total_size, (void**)&storage)); |
| |
| iree_byte_span_t state_span = {storage + state_offset, state_size}; |
| iree_byte_span_t transform_span = {storage + transform_offset, |
| transform_size}; |
| |
| // Initialize streaming state. |
| iree_tokenizer_encode_state_t* state = NULL; |
| iree_status_t status = iree_tokenizer_encode_state_initialize( |
| tokenizer, state_span, transform_span, |
| iree_tokenizer_offset_run_list_empty(), flags, &state); |
| |
| if (!iree_status_is_ok(status)) { |
| iree_allocator_free(allocator, storage); |
| IREE_TRACE_ZONE_END(z0); |
| return status; |
| } |
| |
| // Token output buffer (reused each feed call). |
| iree_tokenizer_token_id_t token_buffer[1024]; |
| iree_tokenizer_token_output_t output = iree_tokenizer_make_token_output( |
| token_buffer, NULL, NULL, IREE_ARRAYSIZE(token_buffer)); |
| |
| // Start output. |
| if (FLAG_json) fputs("{\"ids\":[", stdout); |
| bool first_token = true; |
| |
| // Read and feed chunks until EOF. |
| char read_buffer[8192]; |
| size_t bytes_read; |
| while (iree_status_is_ok(status) && |
| (bytes_read = fread(read_buffer, 1, sizeof(read_buffer), stdin)) > 0) { |
| iree_string_view_t chunk = iree_make_string_view(read_buffer, bytes_read); |
| while (chunk.size > 0 && iree_status_is_ok(status)) { |
| iree_host_size_t bytes_consumed = 0; |
| iree_host_size_t token_count = 0; |
| status = iree_tokenizer_encode_state_feed(state, chunk, output, |
| &bytes_consumed, &token_count); |
| if (iree_status_is_ok(status)) { |
| iree_tooling_print_tokens(token_buffer, NULL, token_count, |
| &first_token); |
| chunk.data += bytes_consumed; |
| chunk.size -= bytes_consumed; |
| } |
| } |
| } |
| |
| // Finalize: flush any pending state. |
| if (iree_status_is_ok(status)) { |
| iree_host_size_t token_count = 0; |
| status = iree_tokenizer_encode_state_finalize(state, output, &token_count); |
| if (iree_status_is_ok(status)) { |
| iree_tooling_print_tokens(token_buffer, NULL, token_count, &first_token); |
| } |
| } |
| |
| // Close output. |
| if (FLAG_json) { |
| fputs("]}\n", stdout); |
| } else { |
| fputc('\n', stdout); |
| } |
| |
| iree_tokenizer_encode_state_deinitialize(state); |
| iree_allocator_free(allocator, storage); |
| |
| IREE_TRACE_ZONE_END(z0); |
| return status; |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // Batch Mode (line-by-line) |
| //===----------------------------------------------------------------------===// |
| |
| // Strips the trailing LF line terminator from a string view. |
| // Only strips \n - does NOT strip \r, which could be content. |
| // The batch protocol uses \n as delimiter (via Python's "\n".join()), so any |
| // \r before the \n is content that must be preserved. |
| static iree_string_view_t iree_tooling_string_view_strip_trailing_newline( |
| iree_string_view_t text) { |
| if (text.size > 0 && text.data[text.size - 1] == '\n') { |
| --text.size; |
| } |
| return text; |
| } |
| |
| // Portable getline implementation that dynamically grows the buffer. |
| // Returns the line length (excluding null terminator), or -1 on EOF/error. |
| // The caller must free *line_ptr using the same allocator when done. |
| static intptr_t iree_tooling_getline(char** line_ptr, |
| iree_host_size_t* capacity_ptr, |
| FILE* stream, iree_allocator_t allocator) { |
| if (*line_ptr == NULL || *capacity_ptr == 0) { |
| *capacity_ptr = 256; |
| iree_status_t status = |
| iree_allocator_malloc(allocator, *capacity_ptr, (void**)line_ptr); |
| if (!iree_status_is_ok(status)) { |
| iree_status_ignore(status); |
| return -1; |
| } |
| } |
| |
| iree_host_size_t position = 0; |
| int character; |
| while ((character = fgetc(stream)) != EOF) { |
| // Grow buffer if needed (leaving room for null terminator). |
| if (position + 1 >= *capacity_ptr) { |
| iree_host_size_t new_capacity = *capacity_ptr * 2; |
| iree_status_t status = |
| iree_allocator_realloc(allocator, new_capacity, (void**)line_ptr); |
| if (!iree_status_is_ok(status)) { |
| iree_status_ignore(status); |
| return -1; |
| } |
| *capacity_ptr = new_capacity; |
| } |
| |
| (*line_ptr)[position++] = (char)character; |
| if (character == '\n') break; |
| } |
| |
| if (position == 0 && character == EOF) return -1; |
| |
| (*line_ptr)[position] = '\0'; |
| return (intptr_t)position; |
| } |
| |
| static iree_status_t iree_tooling_tokenize_batch( |
| const iree_tokenizer_t* tokenizer, iree_allocator_t allocator) { |
| IREE_TRACE_ZONE_BEGIN(z0); |
| |
| char* line = NULL; |
| iree_host_size_t line_capacity = 0; |
| intptr_t line_length; |
| |
| while ((line_length = iree_tooling_getline(&line, &line_capacity, stdin, |
| allocator)) != -1) { |
| iree_string_view_t text = iree_tooling_string_view_strip_trailing_newline( |
| iree_make_string_view(line, (iree_host_size_t)line_length)); |
| iree_status_t status; |
| if (FLAG_decode) { |
| status = iree_tooling_tokenize_decode(tokenizer, text, allocator); |
| } else { |
| status = iree_tooling_tokenize_encode(tokenizer, text, allocator); |
| } |
| if (!iree_status_is_ok(status)) { |
| iree_allocator_free(allocator, line); |
| IREE_TRACE_ZONE_END(z0); |
| return status; |
| } |
| } |
| |
| iree_allocator_free(allocator, line); |
| IREE_TRACE_ZONE_END(z0); |
| return iree_ok_status(); |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // Benchmark Mode |
| //===----------------------------------------------------------------------===// |
| |
| typedef struct { |
| iree_time_t min_ns; |
| iree_time_t max_ns; |
| iree_time_t total_ns; |
| int32_t iterations; |
| iree_host_size_t total_input_bytes; |
| iree_host_size_t total_tokens; |
| iree_host_size_t peak_memory; |
| } iree_tooling_benchmark_stats_t; |
| |
| static void iree_tooling_benchmark_stats_initialize( |
| iree_tooling_benchmark_stats_t* stats) { |
| memset(stats, 0, sizeof(*stats)); |
| stats->min_ns = INT64_MAX; |
| } |
| |
| static void iree_tooling_benchmark_stats_record( |
| iree_tooling_benchmark_stats_t* stats, iree_time_t elapsed_ns, |
| iree_host_size_t input_bytes, iree_host_size_t tokens) { |
| if (elapsed_ns < stats->min_ns) stats->min_ns = elapsed_ns; |
| if (elapsed_ns > stats->max_ns) stats->max_ns = elapsed_ns; |
| stats->total_ns += elapsed_ns; |
| stats->iterations++; |
| stats->total_input_bytes += input_bytes; |
| stats->total_tokens += tokens; |
| } |
| |
| static void iree_tooling_benchmark_stats_print( |
| const iree_tooling_benchmark_stats_t* stats, const char* mode) { |
| iree_time_t average_ns = stats->total_ns / stats->iterations; |
| double tokens_per_sec = |
| (double)stats->total_tokens / ((double)stats->total_ns / 1e9); |
| double mb_per_sec = |
| (double)stats->total_input_bytes / ((double)stats->total_ns / 1e9) / 1e6; |
| |
| if (FLAG_json) { |
| fprintf(stdout, |
| "{\"mode\":\"%s\",\"iterations\":%d," |
| "\"total_input_bytes\":%zu,\"total_tokens\":%zu," |
| "\"min_ns\":%" PRId64 ",\"avg_ns\":%" PRId64 ",\"max_ns\":%" PRId64 |
| "," |
| "\"tokens_per_sec\":%.1f,\"mb_per_sec\":%.3f," |
| "\"peak_memory_bytes\":%zu}\n", |
| mode, stats->iterations, (size_t)stats->total_input_bytes, |
| (size_t)stats->total_tokens, stats->min_ns, average_ns, |
| stats->max_ns, tokens_per_sec, mb_per_sec, |
| (size_t)stats->peak_memory); |
| } else { |
| fprintf(stderr, |
| "Benchmark: %s\n" |
| " Iterations: %d\n" |
| " Input bytes: %zu total\n" |
| " Tokens: %zu total\n" |
| " Latency (ns): min=%" PRId64 " avg=%" PRId64 " max=%" PRId64 |
| "\n" |
| " Throughput: %.1f tokens/sec, %.3f MB/sec\n" |
| " Peak memory: %zu bytes\n", |
| mode, stats->iterations, (size_t)stats->total_input_bytes, |
| (size_t)stats->total_tokens, stats->min_ns, average_ns, |
| stats->max_ns, tokens_per_sec, mb_per_sec, |
| (size_t)stats->peak_memory); |
| } |
| } |
| |
| static iree_status_t iree_tooling_benchmark_oneshot( |
| const iree_tokenizer_t* tokenizer, iree_string_view_t text, |
| iree_allocator_t allocator) { |
| IREE_TRACE_ZONE_BEGIN(z0); |
| |
| iree_tokenizer_encode_flags_t flags = iree_tooling_encode_flags(); |
| |
| // Allocate output buffer sized to text length (generous). |
| iree_host_size_t capacity = iree_max(text.size, (iree_host_size_t)8192); |
| iree_tokenizer_token_id_t* token_ids = NULL; |
| IREE_RETURN_AND_END_ZONE_IF_ERROR( |
| z0, iree_allocator_malloc(allocator, |
| capacity * sizeof(iree_tokenizer_token_id_t), |
| (void**)&token_ids)); |
| |
| iree_tokenizer_token_output_t output = |
| iree_tokenizer_make_token_output(token_ids, NULL, NULL, capacity); |
| |
| iree_tooling_benchmark_stats_t stats; |
| iree_tooling_benchmark_stats_initialize(&stats); |
| stats.peak_memory = capacity * sizeof(iree_tokenizer_token_id_t); |
| |
| // Warmup. |
| for (int32_t i = 0; i < FLAG_benchmark_warmup; ++i) { |
| iree_host_size_t token_count = 0; |
| iree_status_t status = iree_tokenizer_encode(tokenizer, text, flags, output, |
| allocator, &token_count); |
| if (!iree_status_is_ok(status)) { |
| iree_allocator_free(allocator, token_ids); |
| IREE_TRACE_ZONE_END(z0); |
| return status; |
| } |
| } |
| |
| // Timed iterations. |
| for (int32_t i = 0; i < FLAG_benchmark_iterations; ++i) { |
| iree_host_size_t token_count = 0; |
| iree_time_t start = iree_time_now(); |
| iree_status_t status = iree_tokenizer_encode(tokenizer, text, flags, output, |
| allocator, &token_count); |
| iree_time_t end = iree_time_now(); |
| if (!iree_status_is_ok(status)) { |
| iree_allocator_free(allocator, token_ids); |
| IREE_TRACE_ZONE_END(z0); |
| return status; |
| } |
| iree_tooling_benchmark_stats_record(&stats, end - start, text.size, |
| token_count); |
| } |
| |
| iree_tooling_benchmark_stats_print(&stats, "oneshot"); |
| iree_allocator_free(allocator, token_ids); |
| IREE_TRACE_ZONE_END(z0); |
| return iree_ok_status(); |
| } |
| |
| static iree_status_t iree_tooling_benchmark_stream( |
| const iree_tokenizer_t* tokenizer, iree_string_view_t text, |
| iree_allocator_t allocator) { |
| IREE_TRACE_ZONE_BEGIN(z0); |
| |
| iree_tokenizer_encode_flags_t flags = iree_tooling_encode_flags(); |
| iree_host_size_t chunk_size = (iree_host_size_t)FLAG_benchmark_chunk_size; |
| |
| // Allocate combined state and transform buffer. |
| iree_host_size_t state_size = 0; |
| IREE_RETURN_AND_END_ZONE_IF_ERROR( |
| z0, iree_tokenizer_encode_state_calculate_size(tokenizer, &state_size)); |
| iree_host_size_t transform_size = |
| iree_tokenizer_transform_buffer_recommended_size(chunk_size); |
| iree_host_size_t total_size = 0; |
| iree_host_size_t state_offset = 0; |
| iree_host_size_t transform_offset = 0; |
| IREE_RETURN_AND_END_ZONE_IF_ERROR( |
| z0, |
| IREE_STRUCT_LAYOUT( |
| 0, &total_size, IREE_STRUCT_FIELD(state_size, uint8_t, &state_offset), |
| IREE_STRUCT_FIELD(transform_size, uint8_t, &transform_offset))); |
| uint8_t* storage = NULL; |
| IREE_RETURN_AND_END_ZONE_IF_ERROR( |
| z0, iree_allocator_malloc(allocator, total_size, (void**)&storage)); |
| |
| // Token output buffer. |
| iree_tokenizer_token_id_t token_buffer[1024]; |
| iree_tokenizer_token_output_t output = iree_tokenizer_make_token_output( |
| token_buffer, NULL, NULL, IREE_ARRAYSIZE(token_buffer)); |
| |
| iree_byte_span_t state_span = {storage + state_offset, state_size}; |
| iree_byte_span_t transform_span = {storage + transform_offset, |
| transform_size}; |
| |
| iree_tooling_benchmark_stats_t stats; |
| iree_tooling_benchmark_stats_initialize(&stats); |
| stats.peak_memory = total_size + sizeof(token_buffer); |
| |
| iree_status_t status = iree_ok_status(); |
| int32_t total_iterations = FLAG_benchmark_warmup + FLAG_benchmark_iterations; |
| for (int32_t iteration = 0; |
| iteration < total_iterations && iree_status_is_ok(status); ++iteration) { |
| bool is_warmup = (iteration < FLAG_benchmark_warmup); |
| iree_time_t start = iree_time_now(); |
| iree_host_size_t iteration_tokens = 0; |
| |
| // Initialize state for this iteration. |
| iree_tokenizer_encode_state_t* state = NULL; |
| status = iree_tokenizer_encode_state_initialize( |
| tokenizer, state_span, transform_span, |
| iree_tokenizer_offset_run_list_empty(), flags, &state); |
| if (!iree_status_is_ok(status)) break; |
| |
| // Feed text in chunks. |
| iree_host_size_t text_position = 0; |
| while (text_position < text.size && iree_status_is_ok(status)) { |
| iree_host_size_t remaining = text.size - text_position; |
| iree_host_size_t this_chunk = iree_min(remaining, chunk_size); |
| iree_string_view_t chunk = |
| iree_make_string_view(text.data + text_position, this_chunk); |
| while (chunk.size > 0 && iree_status_is_ok(status)) { |
| iree_host_size_t bytes_consumed = 0; |
| iree_host_size_t token_count = 0; |
| status = iree_tokenizer_encode_state_feed( |
| state, chunk, output, &bytes_consumed, &token_count); |
| if (iree_status_is_ok(status)) { |
| iteration_tokens += token_count; |
| chunk.data += bytes_consumed; |
| chunk.size -= bytes_consumed; |
| } |
| } |
| text_position += this_chunk; |
| } |
| |
| // Finalize. |
| if (iree_status_is_ok(status)) { |
| iree_host_size_t token_count = 0; |
| status = |
| iree_tokenizer_encode_state_finalize(state, output, &token_count); |
| if (iree_status_is_ok(status)) { |
| iteration_tokens += token_count; |
| } |
| } |
| |
| iree_tokenizer_encode_state_deinitialize(state); |
| |
| if (iree_status_is_ok(status) && !is_warmup) { |
| iree_time_t end = iree_time_now(); |
| iree_tooling_benchmark_stats_record(&stats, end - start, text.size, |
| iteration_tokens); |
| } |
| } |
| |
| if (iree_status_is_ok(status)) { |
| iree_tooling_benchmark_stats_print(&stats, "stream"); |
| } |
| |
| iree_allocator_free(allocator, storage); |
| IREE_TRACE_ZONE_END(z0); |
| return status; |
| } |
| |
| static iree_status_t iree_tooling_benchmark_decode( |
| const iree_tokenizer_t* tokenizer, iree_string_view_t text, |
| iree_allocator_t allocator) { |
| IREE_TRACE_ZONE_BEGIN(z0); |
| |
| // First, encode the text to get tokens. |
| iree_tokenizer_encode_flags_t flags = iree_tooling_encode_flags(); |
| iree_host_size_t capacity = iree_max(text.size, (iree_host_size_t)8192); |
| iree_tokenizer_token_id_t* token_ids = NULL; |
| IREE_RETURN_AND_END_ZONE_IF_ERROR( |
| z0, iree_allocator_malloc(allocator, |
| capacity * sizeof(iree_tokenizer_token_id_t), |
| (void**)&token_ids)); |
| |
| iree_tokenizer_token_output_t output = |
| iree_tokenizer_make_token_output(token_ids, NULL, NULL, capacity); |
| iree_host_size_t token_count = 0; |
| iree_status_t status = iree_tokenizer_encode(tokenizer, text, flags, output, |
| allocator, &token_count); |
| if (!iree_status_is_ok(status)) { |
| iree_allocator_free(allocator, token_ids); |
| IREE_TRACE_ZONE_END(z0); |
| return status; |
| } |
| |
| iree_tokenizer_token_id_list_t tokens = |
| iree_tokenizer_make_token_id_list(token_ids, token_count); |
| |
| // Allocate decode output buffer. |
| iree_host_size_t text_capacity = 65536; |
| char* text_buffer = NULL; |
| status = |
| iree_allocator_malloc(allocator, text_capacity, (void**)&text_buffer); |
| if (!iree_status_is_ok(status)) { |
| iree_allocator_free(allocator, token_ids); |
| IREE_TRACE_ZONE_END(z0); |
| return status; |
| } |
| |
| iree_mutable_string_view_t text_output = {text_buffer, text_capacity}; |
| |
| iree_tooling_benchmark_stats_t stats; |
| iree_tooling_benchmark_stats_initialize(&stats); |
| stats.peak_memory = |
| capacity * sizeof(iree_tokenizer_token_id_t) + text_capacity; |
| |
| // Decode flags for benchmark iterations. |
| iree_tokenizer_decode_flags_t decode_flags = |
| FLAG_decode_special ? IREE_TOKENIZER_DECODE_FLAG_NONE |
| : IREE_TOKENIZER_DECODE_FLAG_SKIP_SPECIAL_TOKENS; |
| |
| // Warmup. |
| for (int32_t i = 0; i < FLAG_benchmark_warmup && iree_status_is_ok(status); |
| ++i) { |
| iree_host_size_t text_length = 0; |
| status = iree_tokenizer_decode(tokenizer, tokens, decode_flags, text_output, |
| allocator, &text_length); |
| } |
| |
| // Timed iterations. |
| for (int32_t i = 0; |
| i < FLAG_benchmark_iterations && iree_status_is_ok(status); ++i) { |
| iree_host_size_t text_length = 0; |
| iree_time_t start = iree_time_now(); |
| status = iree_tokenizer_decode(tokenizer, tokens, decode_flags, text_output, |
| allocator, &text_length); |
| iree_time_t end = iree_time_now(); |
| if (iree_status_is_ok(status)) { |
| iree_tooling_benchmark_stats_record(&stats, end - start, text_length, |
| token_count); |
| } |
| } |
| |
| if (iree_status_is_ok(status)) { |
| iree_tooling_benchmark_stats_print(&stats, "decode"); |
| } |
| |
| iree_allocator_free(allocator, text_buffer); |
| iree_allocator_free(allocator, token_ids); |
| IREE_TRACE_ZONE_END(z0); |
| return status; |
| } |
| |
| static iree_status_t iree_tooling_tokenize_benchmark( |
| const iree_tokenizer_t* tokenizer, iree_string_view_t text, |
| iree_allocator_t allocator) { |
| iree_string_view_t mode = iree_make_cstring_view(FLAG_benchmark); |
| |
| if (iree_string_view_equal(mode, IREE_SV("oneshot"))) { |
| return iree_tooling_benchmark_oneshot(tokenizer, text, allocator); |
| } else if (iree_string_view_equal(mode, IREE_SV("stream"))) { |
| return iree_tooling_benchmark_stream(tokenizer, text, allocator); |
| } else if (iree_string_view_equal(mode, IREE_SV("decode"))) { |
| return iree_tooling_benchmark_decode(tokenizer, text, allocator); |
| } |
| |
| return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, |
| "unknown benchmark mode '%.*s' " |
| "(expected: oneshot, stream, decode)", |
| (int)mode.size, mode.data); |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // JSON String Processing |
| //===----------------------------------------------------------------------===// |
| |
| // Decodes a JSON-escaped string into raw UTF-8 bytes. |
| static iree_status_t iree_tooling_decode_json_string( |
| iree_string_view_t raw_input, iree_allocator_t allocator, char** out_buffer, |
| iree_string_view_t* out_text) { |
| // Strip surrounding quotes if present. |
| iree_string_view_t escaped = raw_input; |
| if (escaped.size >= 2 && escaped.data[0] == '"' && |
| escaped.data[escaped.size - 1] == '"') { |
| escaped = iree_string_view_substr(escaped, 1, escaped.size - 2); |
| } |
| |
| // First pass: compute required size. |
| iree_host_size_t decoded_length = 0; |
| IREE_RETURN_IF_ERROR( |
| iree_json_unescape_string(escaped, 0, NULL, &decoded_length)); |
| |
| // Allocate and decode. |
| char* buffer = NULL; |
| IREE_RETURN_IF_ERROR( |
| iree_allocator_malloc(allocator, decoded_length + 1, (void**)&buffer)); |
| iree_status_t status = iree_json_unescape_string(escaped, decoded_length + 1, |
| buffer, &decoded_length); |
| if (!iree_status_is_ok(status)) { |
| iree_allocator_free(allocator, buffer); |
| return status; |
| } |
| buffer[decoded_length] = '\0'; |
| |
| *out_buffer = buffer; |
| *out_text = iree_make_string_view(buffer, decoded_length); |
| return iree_ok_status(); |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // Main |
| //===----------------------------------------------------------------------===// |
| |
| int main(int argc, char** argv) { |
| IREE_TRACE_APP_ENTER(); |
| IREE_TRACE_ZONE_BEGIN(z0); |
| |
| iree_allocator_t host_allocator = iree_allocator_system(); |
| int exit_code = EXIT_SUCCESS; |
| |
| iree_flags_set_usage( |
| "iree-tokenize", |
| "Tokenizes text using HuggingFace tokenizer.json files.\n" |
| "Outputs comma-separated token IDs (use --json for JSON format).\n" |
| "\n" |
| "Usage:\n" |
| " iree-tokenize --tokenizer=<file> [flags] <text>\n" |
| "\n" |
| "Examples:\n" |
| "\n" |
| " Encode text to token IDs (default: comma-separated):\n" |
| " iree-tokenize --tokenizer=tokenizer.json \"hello, world!\"\n" |
| " 101,7592,1010,2088,999,102\n" |
| "\n" |
| " JSON output:\n" |
| " iree-tokenize --tokenizer=tokenizer.json --json \"hello, world!\"\n" |
| " {\"ids\":[101,7592,1010,2088,999,102]}\n" |
| "\n" |
| " Use with iree-run-module:\n" |
| " iree-run-module --module=model.vmfb \\\n" |
| " --input=\"6xi32=$(iree-tokenize --tokenizer=tokenizer.json " |
| "'hello')\"\n" |
| "\n" |
| " Encode without special tokens (no [CLS]/[SEP] or BOS/EOS):\n" |
| " iree-tokenize --tokenizer=tokenizer.json --special=false \"hello " |
| "world\"\n" |
| " 7592,2088\n" |
| "\n" |
| " Show token-to-byte offset mappings:\n" |
| " iree-tokenize --tokenizer=tokenizer.json --offsets \"hello world\"\n" |
| " 7592[0:5],2088[6:11]\n" |
| "\n" |
| " Decode token IDs back to text:\n" |
| " iree-tokenize --tokenizer=tokenizer.json --decode " |
| "\"101,7592,2088,102\"\n" |
| " [CLS]helloworld[SEP]\n" |
| "\n" |
| " Show tokenizer info (always JSON):\n" |
| " iree-tokenize --tokenizer=tokenizer.json --info\n" |
| " {\"vocab_size\":30522,\"model_type\":\"BPE\",\"unk_id\":100," |
| "\"cls_id\":101,\"sep_id\":102}\n" |
| "\n" |
| " Batch mode - encode one line per input from stdin:\n" |
| " echo -e \"hello\\nworld\" | iree-tokenize " |
| "--tokenizer=tokenizer.json --batch\n" |
| " 101,7592,102\n" |
| " 101,2088,102\n" |
| "\n" |
| " Stream mode - continuous stdin encoding (no line buffering):\n" |
| " cat large_file.txt | iree-tokenize --tokenizer=tokenizer.json " |
| "--stream\n" |
| " 101,7592,...\n" |
| "\n" |
| " Truncate output to max length:\n" |
| " iree-tokenize --tokenizer=tokenizer.json --max_length=5 \"hello " |
| "world foo\"\n" |
| " 101,7592,2088,29379,102\n" |
| "\n" |
| " Benchmark encode throughput:\n" |
| " iree-tokenize --tokenizer=tokenizer.json --benchmark=oneshot " |
| "\"hello world\"\n" |
| "\n" |
| " JSON output with jq:\n" |
| " iree-tokenize --tokenizer=tokenizer.json --json \"hello\" | jq " |
| "'.ids'\n" |
| " [101,7592,102]\n"); |
| iree_flags_parse_checked(IREE_FLAGS_PARSE_MODE_DEFAULT, &argc, &argv); |
| |
| if (FLAG_tokenizer[0] == '\0') { |
| fprintf(stderr, |
| "Error: missing --tokenizer=<file> flag\n" |
| "Usage: iree-tokenize --tokenizer=<file> [flags] <text>\n" |
| "Run with --help for more information.\n"); |
| IREE_TRACE_ZONE_END(z0); |
| IREE_TRACE_APP_EXIT(EXIT_FAILURE); |
| return EXIT_FAILURE; |
| } |
| |
| iree_string_view_t tokenizer_path = iree_make_cstring_view(FLAG_tokenizer); |
| |
| // Load tokenizer file. |
| iree_io_file_contents_t* file_contents = NULL; |
| iree_status_t status = iree_io_file_contents_map( |
| tokenizer_path, IREE_IO_FILE_ACCESS_READ, host_allocator, &file_contents); |
| |
| // Create tokenizer from either HuggingFace JSON or tiktoken format. |
| iree_tokenizer_t* tokenizer = NULL; |
| if (iree_status_is_ok(status)) { |
| iree_string_view_t file_data = |
| iree_make_string_view((const char*)file_contents->const_buffer.data, |
| file_contents->const_buffer.data_length); |
| |
| if (iree_string_view_ends_with(tokenizer_path, IREE_SV(".tiktoken"))) { |
| // Tiktoken format: resolve encoding config from --encoding flag or |
| // infer from filename (e.g., "cl100k_base.tiktoken" -> "cl100k_base"). |
| const iree_tokenizer_tiktoken_config_t* config = NULL; |
| iree_string_view_t encoding = iree_make_cstring_view(FLAG_encoding); |
| if (iree_string_view_is_empty(encoding)) { |
| // Extract basename: find last '/' (or '\' on Windows). |
| iree_host_size_t last_sep = iree_string_view_find_last_of( |
| tokenizer_path, IREE_SV("/\\"), IREE_STRING_VIEW_NPOS); |
| encoding = (last_sep != IREE_STRING_VIEW_NPOS) |
| ? iree_string_view_substr(tokenizer_path, last_sep + 1, |
| IREE_HOST_SIZE_MAX) |
| : tokenizer_path; |
| iree_string_view_consume_suffix(&encoding, IREE_SV(".tiktoken")); |
| } |
| if (iree_string_view_equal(encoding, IREE_SV("cl100k_base"))) { |
| config = iree_tokenizer_tiktoken_config_cl100k_base(); |
| } else if (iree_string_view_equal(encoding, IREE_SV("o200k_base"))) { |
| config = iree_tokenizer_tiktoken_config_o200k_base(); |
| } else if (iree_string_view_equal(encoding, IREE_SV("r50k_base"))) { |
| config = iree_tokenizer_tiktoken_config_r50k_base(); |
| } else if (iree_string_view_equal(encoding, IREE_SV("p50k_base"))) { |
| config = iree_tokenizer_tiktoken_config_p50k_base(); |
| } else { |
| status = iree_make_status( |
| IREE_STATUS_INVALID_ARGUMENT, |
| "unknown tiktoken encoding '%.*s'; use --encoding with one of: " |
| "cl100k_base, o200k_base, r50k_base, p50k_base", |
| (int)encoding.size, encoding.data); |
| } |
| if (iree_status_is_ok(status)) { |
| status = iree_tokenizer_from_tiktoken(file_data, config, host_allocator, |
| &tokenizer); |
| } |
| } else { |
| // Default: HuggingFace JSON format. |
| status = iree_tokenizer_from_huggingface_json(file_data, host_allocator, |
| &tokenizer); |
| } |
| } |
| |
| // Validate flag combinations. |
| if (iree_status_is_ok(status) && FLAG_json_string) { |
| if (FLAG_batch || FLAG_stream) { |
| fprintf(stderr, |
| "Error: --json_string is not supported with --batch/--stream\n" |
| "(file/stdin input preserves UTF-8; use --json_string only for " |
| "command-line arguments)\n"); |
| status = iree_make_status(IREE_STATUS_INVALID_ARGUMENT, |
| "--json_string requires single input mode"); |
| } else if (FLAG_decode) { |
| fprintf(stderr, |
| "Error: --json_string is not supported with --decode\n" |
| "(decode takes numeric IDs, not text)\n"); |
| status = iree_make_status(IREE_STATUS_INVALID_ARGUMENT, |
| "--json_string requires encode mode"); |
| } |
| } |
| |
| // Process based on mode. |
| if (iree_status_is_ok(status)) { |
| if (FLAG_info) { |
| status = iree_tooling_tokenize_info(tokenizer); |
| } else if (FLAG_benchmark[0] != '\0') { |
| // Benchmark mode requires text input. |
| if (argc < 2) { |
| fprintf(stderr, "Error: --benchmark requires input text argument\n"); |
| status = iree_make_status(IREE_STATUS_INVALID_ARGUMENT, |
| "--benchmark requires input text"); |
| } else { |
| iree_string_view_t input = iree_make_cstring_view(argv[1]); |
| char* decoded_buffer = NULL; |
| if (FLAG_json_string) { |
| status = iree_tooling_decode_json_string(input, host_allocator, |
| &decoded_buffer, &input); |
| } |
| if (iree_status_is_ok(status)) { |
| status = |
| iree_tooling_tokenize_benchmark(tokenizer, input, host_allocator); |
| } |
| iree_allocator_free(host_allocator, decoded_buffer); |
| } |
| } else if (FLAG_stream) { |
| if (FLAG_decode) { |
| fprintf(stderr, "Error: --stream is not supported with --decode\n"); |
| status = iree_make_status(IREE_STATUS_INVALID_ARGUMENT, |
| "--stream requires encode mode"); |
| } else { |
| status = |
| iree_tooling_tokenize_stdin_streaming(tokenizer, host_allocator); |
| } |
| } else if (FLAG_batch) { |
| status = iree_tooling_tokenize_batch(tokenizer, host_allocator); |
| } else if (argc < 2) { |
| fprintf(stderr, |
| "Error: missing input text\n" |
| "Usage: iree-tokenize --tokenizer=<file> [flags] <text>\n"); |
| status = iree_make_status(IREE_STATUS_INVALID_ARGUMENT, "missing input"); |
| } else { |
| iree_string_view_t input = iree_make_cstring_view(argv[1]); |
| char* decoded_buffer = NULL; |
| if (FLAG_json_string) { |
| status = iree_tooling_decode_json_string(input, host_allocator, |
| &decoded_buffer, &input); |
| } |
| if (iree_status_is_ok(status)) { |
| if (FLAG_decode) { |
| status = |
| iree_tooling_tokenize_decode(tokenizer, input, host_allocator); |
| } else { |
| status = |
| iree_tooling_tokenize_encode(tokenizer, input, host_allocator); |
| } |
| } |
| iree_allocator_free(host_allocator, decoded_buffer); |
| } |
| } |
| |
| // Cleanup. |
| if (tokenizer) iree_tokenizer_free(tokenizer); |
| if (file_contents) iree_io_file_contents_free(file_contents); |
| |
| fflush(stdout); |
| if (!iree_status_is_ok(status)) { |
| iree_status_fprint(stderr, status); |
| iree_status_free(status); |
| exit_code = EXIT_FAILURE; |
| } |
| fflush(stderr); |
| |
| IREE_TRACE_ZONE_END(z0); |
| IREE_TRACE_APP_EXIT(exit_code); |
| return exit_code; |
| } |