tools/test/iree-tokenize.json - 3p/openxla/iree - Git at Google

 {
   "model": {
     "type": "WordPiece",
     "vocab": {
       "[CLS]": 0, "[SEP]": 1, "[UNK]": 2, "[PAD]": 3, "[MASK]": 4,
       "hello": 5, "world": 6, "test": 7, "##ing": 8,
       "日": 9, "本": 10, "語": 11, "テスト": 12
     },
     "unk_token": "[UNK]",
     "continuing_subword_prefix": "##"
   },
   "added_tokens": [
     {"id": 0, "content": "[CLS]", "special": true},
     {"id": 1, "content": "[SEP]", "special": true},
     {"id": 2, "content": "[UNK]", "special": true},
     {"id": 3, "content": "[PAD]", "special": true},
     {"id": 4, "content": "[MASK]", "special": true}
   ],
   "pre_tokenizer": {
     "type": "BertPreTokenizer"
   },
   "decoder": {
     "type": "WordPiece",
     "prefix": "##",
     "cleanup": true
   },
   "post_processor": {
     "type": "TemplateProcessing",
     "single": [
       {"SpecialToken": {"id": "[CLS]", "type_id": 0}},
       {"Sequence": {"id": "A", "type_id": 0}},
       {"SpecialToken": {"id": "[SEP]", "type_id": 0}}
     ],
     "pair": [
       {"SpecialToken": {"id": "[CLS]", "type_id": 0}},
       {"Sequence": {"id": "A", "type_id": 0}},
       {"SpecialToken": {"id": "[SEP]", "type_id": 0}},
       {"Sequence": {"id": "B", "type_id": 1}},
       {"SpecialToken": {"id": "[SEP]", "type_id": 1}}
     ],
     "special_tokens": {
       "[CLS]": {"id": "[CLS]", "ids": [0], "tokens": ["[CLS]"]},
       "[SEP]": {"id": "[SEP]", "ids": [1], "tokens": ["[SEP]"]}
     }
   }
 }
	{
	"model": {
	"type": "WordPiece",
	"vocab": {
	"[CLS]": 0, "[SEP]": 1, "[UNK]": 2, "[PAD]": 3, "[MASK]": 4,
	"hello": 5, "world": 6, "test": 7, "##ing": 8,
	"日": 9, "本": 10, "語": 11, "テスト": 12
	},
	"unk_token": "[UNK]",
	"continuing_subword_prefix": "##"
	},
	"added_tokens": [
	{"id": 0, "content": "[CLS]", "special": true},
	{"id": 1, "content": "[SEP]", "special": true},
	{"id": 2, "content": "[UNK]", "special": true},
	{"id": 3, "content": "[PAD]", "special": true},
	{"id": 4, "content": "[MASK]", "special": true}
	],
	"pre_tokenizer": {
	"type": "BertPreTokenizer"
	},
	"decoder": {
	"type": "WordPiece",
	"prefix": "##",
	"cleanup": true
	},
	"post_processor": {
	"type": "TemplateProcessing",
	"single": [
	{"SpecialToken": {"id": "[CLS]", "type_id": 0}},
	{"Sequence": {"id": "A", "type_id": 0}},
	{"SpecialToken": {"id": "[SEP]", "type_id": 0}}
	],
	"pair": [
	{"SpecialToken": {"id": "[CLS]", "type_id": 0}},
	{"Sequence": {"id": "A", "type_id": 0}},
	{"SpecialToken": {"id": "[SEP]", "type_id": 0}},
	{"Sequence": {"id": "B", "type_id": 1}},
	{"SpecialToken": {"id": "[SEP]", "type_id": 1}}
	],
	"special_tokens": {
	"[CLS]": {"id": "[CLS]", "ids": [0], "tokens": ["[CLS]"]},
	"[SEP]": {"id": "[SEP]", "ids": [1], "tokens": ["[SEP]"]}
	}
	}
	}