blob: 499eac439fa266fb0e30b38b0542917a34139cdf [file] [edit]
// Exercises the iree-tokenize command-line tool against the
// iree-tokenize.tiktoken tokenizer file that sits next to this test (lit
// expands %p to this test's directory). Every invocation selects the
// cl100k_base encoding and pipes the tool's output into FileCheck under its
// own check prefix, so the four scenarios are verified independently.
//
// Scenario 1: encode the text "in" (checked under the BASIC prefix).
// RUN: iree-tokenize --tokenizer=%p/iree-tokenize.tiktoken --encoding=cl100k_base "in" | \
// RUN: FileCheck %s --check-prefix=BASIC
// Scenario 2: print tokenizer info (checked under the INFO prefix).
// RUN: iree-tokenize --tokenizer=%p/iree-tokenize.tiktoken --encoding=cl100k_base --info | \
// RUN: FileCheck %s --check-prefix=INFO
// Scenario 3: decode the token id 256 (checked under the DECODE prefix).
// RUN: iree-tokenize --tokenizer=%p/iree-tokenize.tiktoken --encoding=cl100k_base --decode "256" | \
// RUN: FileCheck %s --check-prefix=DECODE
// Scenario 4: encode the text "in" with JSON output (checked under the JSON prefix).
// RUN: iree-tokenize --tokenizer=%p/iree-tokenize.tiktoken --encoding=cl100k_base --json "in" | \
// RUN: FileCheck %s --check-prefix=JSON
//
// FileCheck patterns are substring matches against the tool's output, and
// patterns sharing a prefix must match in the order they are listed.
//
// Basic encoding: "in" merges to rank 256, so the output must contain that id.
// NOTE(review): the value 256 depends on the contents of the .tiktoken
// fixture, which is not visible from this file -- update both together.
// BASIC: 256
// Tokenizer info (minimal vocab + cl100k_base special tokens). Only the model
// type and the merge count are pinned; any other fields in the output are
// left unchecked.
// INFO: "model_type":"BPE"
// INFO: "merge_count":4
// Decode token 256 back to "in" (the inverse of the basic encoding scenario).
// DECODE: in
// JSON output for encoding "in": the same single id, wrapped in an "ids" array.
// JSON: {"ids":[256]}