| // RUN: iree-tokenize %p/iree-tokenize.json "hello world" | \ |
| // RUN: FileCheck %s --check-prefix=BASIC |
| // RUN: iree-tokenize %p/iree-tokenize.json --no_special "hello" | \ |
| // RUN: FileCheck %s --check-prefix=NOSPEC |
| // Unicode test: 日本語テスト (Japanese for "Japanese-language test"). |
| // The input is passed via --json_string with \u escapes so that the command |
| // line itself stays ASCII-only, for Windows portability. |
| // RUN: iree-tokenize %p/iree-tokenize.json --json_string "\u65e5\u672c\u8a9e\u30c6\u30b9\u30c8" | \ |
| // RUN: FileCheck %s --check-prefix=UNICODE |
| // RUN: iree-tokenize %p/iree-tokenize.json --info | \ |
| // RUN: FileCheck %s --check-prefix=INFO |
| // RUN: echo "hello" | iree-tokenize %p/iree-tokenize.json --batch | \ |
| // RUN: FileCheck %s --check-prefix=BATCH |
| // RUN: iree-tokenize %p/iree-tokenize.json --decode "0,5,6,1" | \ |
| // RUN: FileCheck %s --check-prefix=DECODE |
| // RUN: iree-tokenize %p/iree-tokenize.json --raw "hello world" | \ |
| // RUN: FileCheck %s --check-prefix=RAW |
| // RUN: iree-tokenize %p/iree-tokenize.json --raw --no_special "hello" | \ |
| // RUN: FileCheck %s --check-prefix=RAW-NOSPEC |
| |
| // Basic encoding with special tokens. |
| // BASIC: {"ids":[0,5,6,1]} |
| |
| // Encoding without special tokens. |
| // NOSPEC: {"ids":[5]} |
| |
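| // Unicode (Japanese) encoding: each of the three CJK ideographs is split into |
| // its own token, while the katakana run stays together as one word token |
| // (four tokens between the two special tokens). |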
| // UNICODE: {"ids":[0,9,10,11,12,1]} |
| |
| // Tokenizer info. |
| // INFO: "vocab_size":13 |
| // INFO: "model_type":"WordPiece" |
| |
| // Batch mode from stdin. |
| // BATCH: {"ids":[0,5,1]} |
| |
| // Decoding the IDs from the basic test back to text; the special tokens are |
| // kept in the decoded output. |
| // DECODE: {"text":"[CLS] hello world [SEP]"} |
| |
| // Raw output (comma-separated IDs, no JSON wrapper). |
| // RAW: 0,5,6,1 |
| |
| // Raw output without special tokens. |
| // RAW-NOSPEC: 5 |