U
    &c9                     @   s   d Z ddlZddlmZmZ ddlmZ ddlmZ ddl	m
Z
mZ ddlmZ eeZd	d
dZdddddddddddddddZdddddddZG dd de
ZG dd deZdS )z!Tokenization classes for RoBERTa.    N)ListOptional)
AddedToken)RobertaProcessing   )GPT2TokenizerGPT2TokenizerFast)PreTrainedTokenizerz
vocab.jsonz
merges.txt)
vocab_filemerges_filezKhttps://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.jsonzLhttps://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.jsonzQhttps://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-vocab.jsonzQhttps://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-vocab.json)zroberta-basezroberta-largezroberta-large-mnlizdistilroberta-basezroberta-base-openai-detectorzroberta-large-openai-detectorzKhttps://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txtzLhttps://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txtzQhttps://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-merges.txtzQhttps://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-merges.txti   c                       s   e Zd ZdZeZeZeZ	dgZ
d fd	d
	Zdee eee  ee dddZdee eee  eee dddZdee eee  ee dddZdddZ  ZS )RobertaTokenizera+  
    Constructs a RoBERTa BPE tokenizer, derived from the GPT-2 tokenizer. Peculiarities:

    - Byte-level Byte-Pair-Encoding
    - Requires a space to start the input string => the encoding methods should be called with the
      ``add_prefix_space`` flag set to ``True``.
      Otherwise, this tokenizer ``encode`` and ``decode`` method will not conserve
      the absence of a space at the beginning of a string:

    ::

        tokenizer.decode(tokenizer.encode("Hello")) = " Hello"

    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
    should refer to the superclass for more information regarding methods.

    Args:
        vocab_file (:obj:`str`):
            Path to the vocabulary file.
        merges_file (:obj:`str`):
            Path to the merges file.
        errors (:obj:`str`, `optional`, defaults to "replace"):
            Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode
            <https://docs.python.org/3/library/stdtypes.html#bytes.decode>`__ for more information.
        bos_token (:obj:`string`, `optional`, defaults to "<s>"):
            The beginning of sequence token that was used during pre-training. Can be used a sequence classifier token.

            .. note::

                When building a sequence using special tokens, this is not the token that is used for the beginning
                of sequence. The token used is the :obj:`cls_token`.
        eos_token (:obj:`string`, `optional`, defaults to "</s>"):
            The end of sequence token.

            .. note::

                When building a sequence using special tokens, this is not the token that is used for the end
                of sequence. The token used is the :obj:`sep_token`.
        sep_token (:obj:`string`, `optional`, defaults to "</s>"):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
            for sequence classification or for a text and a question for question answering.
            It is also used as the last token of a sequence built with special tokens.
        cls_token (:obj:`string`, `optional`, defaults to "<s>"):
            The classifier token which is used when doing sequence classification (classification of the whole
            sequence instead of per-token classification). It is the first token of the sequence when built with
            special tokens.
        unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (:obj:`string`, `optional`, defaults to "<pad>"):
            The token used for padding, for example when batching sequences of different lengths.
        mask_token (:obj:`string`, `optional`, defaults to "<mask>"):
            The token used for masking values. This is the token used when training this model with masked language
            modeling. This is the token which the model will try to predict.
    attention_maskreplace<s></s><unk><pad><mask>c                    s,   t  jf |||||||||	|
d
| d S )N)
r
   r   errors	bos_token	eos_token	unk_token	sep_token	cls_token	pad_token
mask_token)super__init__)selfr
   r   r   r   r   r   r   r   r   r   kwargs	__class__ E/tmp/pip-unpacked-wheel-ymerj3tt/transformers/tokenization_roberta.pyr   ~   s    zRobertaTokenizer.__init__N)token_ids_0token_ids_1returnc                 C   sD   |dkr| j g| | jg S | j g}| jg}|| | | | | S )a  
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks
        by concatenating and adding special tokens.
        A RoBERTa sequence has the following format:

        - single sequence: ``<s> X </s>``
        - pair of sequences: ``<s> A </s></s> B </s>``

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of IDs to which the special tokens will be added
            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
                Optional second list of IDs for sequence pairs.

        Returns:
            :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
        N)cls_token_idsep_token_id)r   r$   r%   clssepr"   r"   r#    build_inputs_with_special_tokens   s
    z1RobertaTokenizer.build_inputs_with_special_tokensF)r$   r%   already_has_special_tokensr&   c                    s|   |r*|dk	rt dtt fdd|S |dkrLdgdgt|  dg S dgdgt|  ddg dgt|  dg S )a  
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of ids.
            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Set to True if the token list is already formatted with special tokens for the model

        Returns:
            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        Nz~You should not supply a second sequence if the provided sequence of ids is already formated with special tokens for the model.c                    s   |  j  jfkrdS dS )Nr   r   )r(   r'   )xr   r"   r#   <lambda>       z:RobertaTokenizer.get_special_tokens_mask.<locals>.<lambda>r   r   )
ValueErrorlistmaplen)r   r$   r%   r,   r"   r.   r#   get_special_tokens_mask   s    z(RobertaTokenizer.get_special_tokens_maskc                 C   sP   | j g}| jg}|dkr.t|| | dg S t|| | | | | dg S )a  
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
        RoBERTa does not make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of ids.
            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
                Optional second list of IDs for sequence pairs.

        Returns:
            :obj:`List[int]`: List of zeros.

        Nr   )r(   r'   r4   )r   r$   r%   r*   r)   r"   r"   r#   $create_token_type_ids_from_sequences   s
    z5RobertaTokenizer.create_token_type_ids_from_sequencesc                 K   s2   d|kr|d }n|}|r.|d   s.d| }|S )Nadd_prefix_spacer    )isspace)r   textadd_special_tokensr   r7   r"   r"   r#   prepare_for_tokenization   s    
z)RobertaTokenizer.prepare_for_tokenization)r   r   r   r   r   r   r   r   )N)NF)N)F)__name__
__module____qualname____doc__VOCAB_FILES_NAMESvocab_files_namesPRETRAINED_VOCAB_FILES_MAPpretrained_vocab_files_map&PRETRAINED_POSITIONAL_EMBEDDINGS_SIZESmax_model_input_sizesmodel_input_namesr   r   intr   r+   boolr5   r6   r<   __classcell__r"   r"   r    r#   r   @   sF   8          
    
   
r   c                
       sN   e Zd ZdZeZeZeZ	dgZ
d fd
d	Zejjdd ZdddZ  ZS )RobertaTokenizerFasta  
    Constructs a "Fast" RoBERTa BPE tokenizer (backed by HuggingFace's `tokenizers` library).

    Peculiarities:

    - Byte-level Byte-Pair-Encoding
    - Requires a space to start the input string => the encoding methods should be called with the
      ``add_prefix_space`` flag set to ``True``.
      Otherwise, this tokenizer ``encode`` and ``decode`` method will not conserve
      the absence of a space at the beginning of a string:

    ::

        tokenizer.decode(tokenizer.encode("Hello")) = " Hello"

    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the methods. Users
    should refer to the superclass for more information regarding methods.

    Args:
        vocab_file (:obj:`str`):
            Path to the vocabulary file.
        merges_file (:obj:`str`):
            Path to the merges file.
        errors (:obj:`str`, `optional`, defaults to "replace"):
            Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode
            <https://docs.python.org/3/library/stdtypes.html#bytes.decode>`__ for more information.
        unk_token (:obj:`string`, `optional`, defaults to `<|endoftext|>`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        bos_token (:obj:`string`, `optional`, defaults to `<|endoftext|>`):
            The beginning of sequence token.
        eos_token (:obj:`string`, `optional`, defaults to `<|endoftext|>`):
            The end of sequence token.
        add_prefix_space (:obj:`bool`, `optional`, defaults to `False`):
            Whether to add a leading space to the first word.
            This allows to treat the leading word just as any other word.
            (GPT2 tokenizer detect beginning of words by the preceeding space)
        trim_offsets (:obj:`bool`, `optional`, defaults to `True`):
            Whether the post processing step should trim offsets to avoid including whitespaces.
    r   r   r   r   r   r   r   Tc              
      s   | d|	 | d| | d| | d|
 t jf |||||||d| t|| jf|| jf||d| jj_| j	|d g d S )Nr   r   r   r   )r
   r   r   r   r   r7   trim_offsets)r*   r)   r7   rL   )

setdefaultr   r   r   r(   r'   Zbackend_tokenizer
_tokenizerZpost_processorr;   )r   r
   r   r   r   r   r   r   r   r   r   r7   rL   r   r    r"   r#   r   #  s,    zRobertaTokenizerFast.__init__c                 C   s0   t |tst|dd}t|| _| |g d S )NT)lstrip)
isinstancer   strZ_mask_tokenZ_maybe_update_backend)r   valuer"   r"   r#   r   L  s    

zRobertaTokenizerFast.mask_tokenNc                 C   s8   | j g| | jg }|d kr |S || jg | | jg S )N)Zbos_token_idZeos_token_id)r   r$   r%   outputr"   r"   r#   r+   T  s    z5RobertaTokenizerFast.build_inputs_with_special_tokens)
r   r   r   r   r   r   r   r   TT)N)r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   r   r	   r   setterr+   rJ   r"   r"   r    r#   rK      s&   )          )
rK   )r@   loggingtypingr   r   Z
tokenizersr   Ztokenizers.processorsr   Ztokenization_gpt2r   r   Ztokenization_utilsr	   	getLoggerr=   loggerrA   rC   rE   r   rK   r"   r"   r"   r#   <module>   sF   
	
 5