- r"""
-
- """
-
- __all__ = [
- "RobertaTokenizer"
- ]
-
- import json
- from .gpt2_tokenizer import GPT2Tokenizer
- from ..utils import _get_file_name_base_on_postfix
- from ...io.file_utils import _get_roberta_dir
-
- PRETRAINED_ROBERTA_POSITIONAL_EMBEDDINGS_SIZES = {
- "roberta-base": 512,
- "roberta-large": 512,
- "roberta-large-mnli": 512,
- "distilroberta-base": 512,
- "roberta-base-openai-detector": 512,
- "roberta-large-openai-detector": 512,
- }
-
-
- class RobertaTokenizer(GPT2Tokenizer):
-     r"""
-     RoBERTa tokenizer, based on the same byte-level BPE vocabulary format as GPT-2.
-     """
-
-     vocab_files_names = {
-         "vocab_file": "vocab.json",
-         "merges_file": "merges.txt",
-     }
-
-     def __init__(
-         self,
-         vocab_file,
-         merges_file,
-         errors="replace",
-         bos_token="<s>",
-         eos_token="</s>",
-         sep_token="</s>",
-         cls_token="<s>",
-         unk_token="<unk>",
-         pad_token="<pad>",
-         mask_token="<mask>",
-         **kwargs
-     ):
-         super().__init__(
-             vocab_file=vocab_file,
-             merges_file=merges_file,
-             errors=errors,
-             bos_token=bos_token,
-             eos_token=eos_token,
-             unk_token=unk_token,
-             sep_token=sep_token,
-             cls_token=cls_token,
-             pad_token=pad_token,
-             mask_token=mask_token,
-             **kwargs,
-         )
-         self.max_len_single_sentence = self.max_len - 2  # take into account special tokens: <s> X </s>
-         self.max_len_sentences_pair = self.max_len - 4  # take into account special tokens: <s> A </s></s> B </s>
-
-     @classmethod
-     def from_pretrained(cls, model_dir_or_name, *inputs, **kwargs):
-         """
-
-         :param str model_dir_or_name: a local model directory or a shortcut model name
-         :param kwargs:
-         :return: RobertaTokenizer
-         """
-         # The tokenizer itself only needs two files: vocab.json and merges.txt.
-         model_dir = _get_roberta_dir(model_dir_or_name)
-         # The resolved directory contains four files: vocab.json, merges.txt, config.json and model.bin.
-
-         tokenizer_config_file = _get_file_name_base_on_postfix(model_dir, 'config.json')
-         with open(tokenizer_config_file, encoding="utf-8") as tokenizer_config_handle:
-             init_kwargs = json.load(tokenizer_config_handle)
-         # Set max length if needed
-         if model_dir_or_name in PRETRAINED_ROBERTA_POSITIONAL_EMBEDDINGS_SIZES:
-             # if we're using a pretrained model, ensure the tokenizer
-             # won't index sequences longer than the number of positional embeddings
-             max_len = PRETRAINED_ROBERTA_POSITIONAL_EMBEDDINGS_SIZES[model_dir_or_name]
-             if max_len is not None and isinstance(max_len, (int, float)):
-                 init_kwargs["max_len"] = min(init_kwargs.get("max_len", int(1e12)), max_len)
-
-         # Add the vocab and merges files to init_kwargs
-         if 'vocab_file' in kwargs:  # if a vocab file is explicitly given, use it
-             init_kwargs['vocab_file'] = kwargs['vocab_file']
-         else:
-             init_kwargs['vocab_file'] = _get_file_name_base_on_postfix(model_dir, RobertaTokenizer.vocab_files_names['vocab_file'])
-         init_kwargs['merges_file'] = _get_file_name_base_on_postfix(model_dir, RobertaTokenizer.vocab_files_names['merges_file'])
-
-         init_inputs = init_kwargs.pop("init_inputs", ())
-         # Instantiate tokenizer.
-         try:
-             tokenizer = cls(*init_inputs, **init_kwargs)
-         except OSError:
-             raise OSError(
-                 "Unable to load vocabulary from file. "
-                 "Please check that the provided vocabulary is accessible and not corrupted."
-             )
-
-         return tokenizer
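
# Illustrative usage (a minimal sketch, not part of the original file). It assumes a
# shortcut name such as "roberta-base" (any key of PRETRAINED_ROBERTA_POSITIONAL_EMBEDDINGS_SIZES)
# or a local directory holding vocab.json, merges.txt and config.json, and it assumes that
# the GPT2Tokenizer base class exposes tokenize() and convert_tokens_to_ids(), as in the
# implementation this module was adapted from.
if __name__ == "__main__":
    tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
    tokens = tokenizer.tokenize("Hello world")       # byte-level BPE tokens
    ids = tokenizer.convert_tokens_to_ids(tokens)    # integer ids for the model
    print(tokens, ids)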