// Lit tests for the iree-tokenize CLI tool (encode/decode paths of a WordPiece tokenizer).
// RUN: iree-tokenize %p/iree-tokenize.json "hello world" | \
// RUN: FileCheck %s --check-prefix=BASIC
// RUN: iree-tokenize %p/iree-tokenize.json --no_special "hello" | \
// RUN: FileCheck %s --check-prefix=NOSPEC
// Unicode test: 日本語テスト (Japanese: "Japanese language test")
// Uses --json_string with \u escapes for Windows portability.
// RUN: iree-tokenize %p/iree-tokenize.json --json_string "\u65e5\u672c\u8a9e\u30c6\u30b9\u30c8" | \
// RUN: FileCheck %s --check-prefix=UNICODE
// RUN: iree-tokenize %p/iree-tokenize.json --info | \
// RUN: FileCheck %s --check-prefix=INFO
// RUN: echo "hello" | iree-tokenize %p/iree-tokenize.json --batch | \
// RUN: FileCheck %s --check-prefix=BATCH
// RUN: iree-tokenize %p/iree-tokenize.json --decode "0,5,6,1" | \
// RUN: FileCheck %s --check-prefix=DECODE
// RUN: iree-tokenize %p/iree-tokenize.json --raw "hello world" | \
// RUN: FileCheck %s --check-prefix=RAW
// RUN: iree-tokenize %p/iree-tokenize.json --raw --no_special "hello" | \
// RUN: FileCheck %s --check-prefix=RAW-NOSPEC
// Basic encoding with special tokens.
// BASIC: {"ids":[0,5,6,1]}
// Encoding without special tokens.
// NOSPEC: {"ids":[5]}
// Unicode (Japanese) encoding: CJK chars split individually, katakana as word.
// UNICODE: {"ids":[0,9,10,11,12,1]}
// Tokenizer info.
// INFO: "vocab_size":13
// INFO: "model_type":"WordPiece"
// Batch mode from stdin.
// BATCH: {"ids":[0,5,1]}
// Decoding back to text.
// DECODE: {"text":"[CLS] hello world [SEP]"}
// Raw output (comma-separated IDs, no JSON wrapper).
// RAW: 0,5,6,1
// Raw output without special tokens.
// RAW-NOSPEC: 5