blob: 8252c09187745b0a860f52bcf422ff4975cdb55d [file] [log] [blame]
// Command-line test for iree-tokenize. Every invocation below loads the
// tokenizer definition from iree-tokenize.json next to this file (%p) and
// pipes the tool's stdout into FileCheck under a dedicated check prefix.
// The expected output for each prefix follows the invocations.
// RUN: iree-tokenize --tokenizer=%p/iree-tokenize.json "hello world" | \
// RUN: FileCheck %s --check-prefix=BASIC
// RUN: iree-tokenize --tokenizer=%p/iree-tokenize.json --special=false "hello" | \
// RUN: FileCheck %s --check-prefix=NOSPEC
// RUN: iree-tokenize --tokenizer=%p/iree-tokenize.json --info | \
// RUN: FileCheck %s --check-prefix=INFO
// RUN: iree-tokenize --tokenizer=%p/iree-tokenize.json --decode "0,19,21,1" | \
// RUN: FileCheck %s --check-prefix=DECODE
// RUN: iree-tokenize --tokenizer=%p/iree-tokenize.json --decode --decode_special "0,19,21,1" | \
// RUN: FileCheck %s --check-prefix=DECODE-SPECIAL
// RUN: iree-tokenize --tokenizer=%p/iree-tokenize.json --json "hello world" | \
// RUN: FileCheck %s --check-prefix=JSON
// RUN: iree-tokenize --tokenizer=%p/iree-tokenize.json --json --special=false "hello" | \
// RUN: FileCheck %s --check-prefix=JSON-NOSPEC
// RUN: iree-tokenize --tokenizer=%p/iree-tokenize.json --offsets "hello world" | \
// RUN: FileCheck %s --check-prefix=OFFSETS
// RUN: iree-tokenize --tokenizer=%p/iree-tokenize.json --offsets --json "hello world" | \
// RUN: FileCheck %s --check-prefix=OFFSETS-JSON
// Basic encoding with special tokens (BPE: <s> hello Ġworld </s>).
// BASIC: 0,19,21,1
// Encoding without special tokens (BPE: hello).
// The pattern is anchored to the whole line: FileCheck matches substrings, so
// a bare "19" would also accept "0,19,1", i.e. output that still contains the
// special tokens this invocation is supposed to drop.
// NOSPEC: {{^}}19{{$}}
// Tokenizer info (always JSON).
// INFO: "vocab_size":22
// INFO: "model_type":"BPE"
// INFO: "merge_count":9
// Decoding back to text (default: skip special tokens).
// The pattern is anchored to the whole line: unanchored, it would also accept
// "<s>hello world</s>", i.e. output where the special tokens were not skipped.
// DECODE: {{^}}hello world{{$}}
// Decoding with special tokens included (--decode_special).
// DECODE-SPECIAL: <s>hello world</s>
// JSON output with --json flag.
// JSON: {"ids":[0,19,21,1]}
// JSON output without special tokens.
// JSON-NOSPEC: {"ids":[19]}
// Offsets: special tokens get [0:0], model tokens get byte ranges.
// OFFSETS: 0[0:0],19[0:5],21[5:11],1[0:0]
// JSON offsets output (split to avoid FileCheck [[regex-var]] parsing).
// OFFSETS-JSON: {"ids":[0,19,21,1],"offsets":[
// OFFSETS-JSON-SAME: [0,0],[0,5],[5,11],[0,0]]}