tools/test/iree-tokenize.json - 3p/openxla/iree - Git at Google

 {
   "model": {
     "type": "BPE",
     "vocab": {
       "<s>": 0, "</s>": 1, "<unk>": 2,
       "h": 3, "e": 4, "l": 5, "o": 6,
       "\u0120": 7, "w": 8, "r": 9, "d": 10,
       "t": 11, "s": 12,
       "he": 13, "ll": 14, "\u0120w": 15, "or": 16, "ld": 17,
       "hell": 18, "hello": 19, "orld": 20, "\u0120world": 21
     },
     "merges": [
       "h e",
       "l l",
       "\u0120 w",
       "o r",
       "l d",
       "he ll",
       "hell o",
       "or ld",
       "\u0120w orld"
     ]
   },
   "added_tokens": [
     {"id": 0, "content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true},
     {"id": 1, "content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true},
     {"id": 2, "content": "<unk>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true}
   ],
   "pre_tokenizer": {"type": "ByteLevel", "add_prefix_space": false, "trim_offsets": true, "use_regex": true},
   "decoder": {"type": "ByteLevel", "add_prefix_space": true, "trim_offsets": true, "use_regex": true},
   "post_processor": {
     "type": "TemplateProcessing",
     "single": [
       {"SpecialToken": {"id": "<s>", "type_id": 0}},
       {"Sequence": {"id": "A", "type_id": 0}},
       {"SpecialToken": {"id": "</s>", "type_id": 0}}
     ],
     "pair": [
       {"SpecialToken": {"id": "<s>", "type_id": 0}},
       {"Sequence": {"id": "A", "type_id": 0}},
       {"SpecialToken": {"id": "</s>", "type_id": 0}},
       {"Sequence": {"id": "B", "type_id": 1}},
       {"SpecialToken": {"id": "</s>", "type_id": 1}}
     ],
     "special_tokens": {
       "<s>": {"id": "<s>", "ids": [0], "tokens": ["<s>"]},
       "</s>": {"id": "</s>", "ids": [1], "tokens": ["</s>"]}
     }
   }
 }
	{
	"model": {
	"type": "BPE",
	"vocab": {
	"<s>": 0, "</s>": 1, "<unk>": 2,
	"h": 3, "e": 4, "l": 5, "o": 6,
	"\u0120": 7, "w": 8, "r": 9, "d": 10,
	"t": 11, "s": 12,
	"he": 13, "ll": 14, "\u0120w": 15, "or": 16, "ld": 17,
	"hell": 18, "hello": 19, "orld": 20, "\u0120world": 21
	},
	"merges": [
	"h e",
	"l l",
	"\u0120 w",
	"o r",
	"l d",
	"he ll",
	"hell o",
	"or ld",
	"\u0120w orld"
	]
	},
	"added_tokens": [
	{"id": 0, "content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true},
	{"id": 1, "content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true},
	{"id": 2, "content": "<unk>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true}
	],
	"pre_tokenizer": {"type": "ByteLevel", "add_prefix_space": false, "trim_offsets": true, "use_regex": true},
	"decoder": {"type": "ByteLevel", "add_prefix_space": true, "trim_offsets": true, "use_regex": true},
	"post_processor": {
	"type": "TemplateProcessing",
	"single": [
	{"SpecialToken": {"id": "<s>", "type_id": 0}},
	{"Sequence": {"id": "A", "type_id": 0}},
	{"SpecialToken": {"id": "</s>", "type_id": 0}}
	],
	"pair": [
	{"SpecialToken": {"id": "<s>", "type_id": 0}},
	{"Sequence": {"id": "A", "type_id": 0}},
	{"SpecialToken": {"id": "</s>", "type_id": 0}},
	{"Sequence": {"id": "B", "type_id": 1}},
	{"SpecialToken": {"id": "</s>", "type_id": 1}}
	],
	"special_tokens": {
	"<s>": {"id": "<s>", "ids": [0], "tokens": ["<s>"]},
	"</s>": {"id": "</s>", "ids": [1], "tokens": ["</s>"]}
	}
	}
	}