""" Tokenization class for model T5."""


import logging
import os
import re
from shutil import copyfile

from .tokenization_utils import PreTrainedTokenizer


logger = logging.getLogger(__name__)

SPIECE_UNDERLINE = "▁"

VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}

PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "t5-small": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
        "t5-base": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
        "t5-large": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
        "t5-3b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
        "t5-11b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
    }
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "t5-small": 512,
    "t5-base": 512,
    "t5-large": 512,
    "t5-3b": 512,
    "t5-11b": 512,
}


class T5Tokenizer(PreTrainedTokenizer):
    """
        Constructs a T5 tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__.

        This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
        should refer to the superclass for more information regarding methods.

        Args:
            vocab_file (:obj:`string`):
                `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
                contains the vocabulary necessary to instantiate a tokenizer.
            eos_token (:obj:`string`, `optional`, defaults to "</s>"):
                The end of sequence token.

                .. note::

                    When building a sequence using special tokens, this is not the token that is used for the end
                    of sequence. The token used is the :obj:`sep_token`.
            unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
                The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
                token instead.
            pad_token (:obj:`string`, `optional`, defaults to "<pad>"):
                The token used for padding, for example when batching sequences of different lengths.
            extra_ids (:obj:`List[str]`, `optional`, defaults to :obj:`100`):
                Add a number of extra ids added to the end of the vocabulary for use as sentinels.
                These tokens are accessible as "<extra_id_{%d}>" where "{%d}" is a number between 0 and extra_ids-1.
                Extra tokens are indexed from the end of the vocabulary up to the beginning ("<extra_id_0>" is the last token in the vocabulary, like in T5 preprocessing;
                see: https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117)
            additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`None`):
                Additional special tokens used by the tokenizer.
    </s><unk><pad>d   Nc           	         s   |dkr,|d krg }| dd t|D  t jf ||||d| zdd l}W n  tk
rt   td  Y nX || _|| _	|
 | _| j| d S )Nr   c                 S   s   g | ]}d  |qS )<extra_id_{}>)format.0i r   @/tmp/pip-unpacked-wheel-ymerj3tt/transformers/tokenization_t5.py
<listcomp>p   s     z(T5Tokenizer.__init__.<locals>.<listcomp>)	eos_token	unk_token	pad_tokenadditional_special_tokenszuYou need to install SentencePiece to use T5Tokenizer:https://github.com/google/sentencepiecepip install sentencepiece)extendrangesuper__init__sentencepieceImportErrorloggerwarningr   
_extra_idsSentencePieceProcessorsp_modelLoad)	selfr   r   r   r   Z	extra_idsr   kwargsspm	__class__r   r   r   b   s.    
zT5Tokenizer.__init__c                 C   s   | j  | j S )N)r!   get_piece_sizer   r#   r   r   r   
    @property
    def vocab_size(self):
        return self.sp_model.get_piece_size() + self._extra_ids

    def get_vocab(self):
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def __getstate__(self):
        # The SentencePiece processor cannot be pickled; drop it here and
        # reload it from `vocab_file` in `__setstate__`.
        state = self.__dict__.copy()
        state["sp_model"] = None
        return state

    def __setstate__(self, d):
        self.__dict__ = d
        try:
            import sentencepiece as spm
        except ImportError:
            logger.warning(
                "You need to install SentencePiece to use T5Tokenizer: "
                "https://github.com/google/sentencepiece "
                "pip install sentencepiece"
            )
            raise
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(self.vocab_file)

    def _tokenize(self, text, sample=False):
        """ Take as input a string and return a list of strings (tokens) for words/sub-words
        """
        if not sample:
            pieces = self.sp_model.EncodeAsPieces(text)
        else:
            # Subword regularization: sample among the 64 best segmentations
            # with smoothing parameter alpha=0.1.
            pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)
        return pieces
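    # Note on `sample` (illustrative; the public `tokenize()` wrapper of the
    # base class is what normally reaches this method):
    #
    #     tokenizer.tokenize("The quick brown fox")    # deterministic pieces
    #
    # With sample=True the call goes through SentencePiece's
    # SampleEncodeAsPieces, which draws one of the n-best segmentations each
    # time, so repeated calls may return different piece sequences.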
<extra_id_z<extra_id_(\d+)>r   )
startswithrematchintgroupr*   r!   Zpiece_to_id)r#   tokenr;   numr   r   r   _convert_token_to_id   s
    
z T5Tokenizer._convert_token_to_idc                 C   s4   || j  k r| j |}nd| jd | }|S )z=Converts an index (integer) in a token (str) using the vocab.r   r   )r!   r(   Z	IdToPiecer   r*   )r#   indexr>   r   r   r   _convert_id_to_token   s    z T5Tokenizer._convert_id_to_tokenc                 C   s   | j |}|S )z< Converts a sequence of tokens (string) in a single string. )r!   Zdecode_pieces)r#   tokensZ
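    # Worked example of the sentinel arithmetic (figures assume the standard
    # 32000-piece T5 SentencePiece model and the default extra_ids=100, so
    # vocab_size == 32100):
    #
    #     "<extra_id_0>"  -> 32100 - 0  - 1 = 32099   (last id in the vocabulary)
    #     "<extra_id_99>" -> 32100 - 99 - 1 = 32000   (first id above the SentencePiece pieces)
    #     id 32099        -> "<extra_id_{}>".format(32100 - 1 - 32099) == "<extra_id_0>"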
    def convert_tokens_to_string(self, tokens):
        """ Converts a sequence of tokens (string) in a single string. """
        out_string = self.sp_model.decode_pieces(tokens)
        return out_string

    def save_vocabulary(self, save_directory):
        """ Save the sentencepiece vocabulary (copy original file) and special tokens file
            to a directory.
        """
        if not os.path.isdir(save_directory):
            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
            return
        out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"])

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
            copyfile(self.vocab_file, out_vocab_file)

        return (out_vocab_file,)
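    # Usage sketch (illustrative; "./t5_vocab" is an assumed, already-existing
    # directory):
    #
    #     (vocab_path,) = tokenizer.save_vocabulary("./t5_vocab")
    #
    # Only the spiece.model file is copied here; the special-tokens files are
    # written by `save_pretrained` on the base class.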
   N)F)__name__
__module____qualname____doc__rJ   Zvocab_files_namesPRETRAINED_VOCAB_FILES_MAPZpretrained_vocab_files_map&PRETRAINED_POSITIONAL_EMBEDDINGS_SIZESZmax_model_input_sizesr   propertyr*   r-   r1   r3   r8   r@   rB   rD   rL   __classcell__r   r   r&   r   r   >   s(        (

	r   )rP   loggingrE   r:   shutilr   Ztokenization_utilsr   	getLoggerrM   r   ZSPIECE_UNDERLINErJ   rQ   rR   r   r   r   r   r   <module>   s.   
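# End-to-end usage sketch (illustrative only, not part of the original module).
# It assumes the "t5-small" shortcut name and network access to fetch spiece.model:
#
#     tokenizer = T5Tokenizer.from_pretrained("t5-small")
#     ids = tokenizer.encode("Translate English to German: Hello.")
#     text = tokenizer.decode(ids)
#
# `from_pretrained`, `encode` and `decode` are inherited from PreTrainedTokenizer;
# this module only implements the SentencePiece-specific hooks above.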
	