{
  "model": {
    "type": "BPE",
    "vocab": {
      "<s>": 0, "</s>": 1, "<unk>": 2,
      "h": 3, "e": 4, "l": 5, "o": 6,
      "\u0120": 7, "w": 8, "r": 9, "d": 10,
      "t": 11, "s": 12,
      "he": 13, "ll": 14, "\u0120w": 15, "or": 16, "ld": 17,
      "hell": 18, "hello": 19, "orld": 20, "\u0120world": 21
    },
    "merges": [
      "h e",
      "l l",
      "\u0120 w",
      "o r",
      "l d",
      "he ll",
      "hell o",
      "or ld",
      "\u0120w orld"
    ]
  },
  "added_tokens": [
    {"id": 0, "content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true},
    {"id": 1, "content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true},
    {"id": 2, "content": "<unk>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true}
  ],
  "pre_tokenizer": {"type": "ByteLevel", "add_prefix_space": false, "trim_offsets": true, "use_regex": true},
  "decoder": {"type": "ByteLevel", "add_prefix_space": true, "trim_offsets": true, "use_regex": true},
  "post_processor": {
    "type": "TemplateProcessing",
    "single": [
      {"SpecialToken": {"id": "<s>", "type_id": 0}},
      {"Sequence": {"id": "A", "type_id": 0}},
      {"SpecialToken": {"id": "</s>", "type_id": 0}}
    ],
    "pair": [
      {"SpecialToken": {"id": "<s>", "type_id": 0}},
      {"Sequence": {"id": "A", "type_id": 0}},
      {"SpecialToken": {"id": "</s>", "type_id": 0}},
      {"Sequence": {"id": "B", "type_id": 1}},
      {"SpecialToken": {"id": "</s>", "type_id": 1}}
    ],
    "special_tokens": {
      "<s>": {"id": "<s>", "ids": [0], "tokens": ["<s>"]},
      "</s>": {"id": "</s>", "ids": [1], "tokens": ["</s>"]}
    }
  }
}