""" Tokenization class for model Reformer."""


import logging
import os
from shutil import copyfile

from .tokenization_utils import PreTrainedTokenizer


logger = logging.getLogger(__name__)

SPIECE_UNDERLINE = "▁"

VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}

PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "google/reformer-crime-and-punishment": "https://cdn.huggingface.co/google/reformer-crime-and-punishment/spiece.model"
    }
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "google/reformer-crime-and-punishment": 524288,
}


class ReformerTokenizer(PreTrainedTokenizer):
    """
        Constructs a Reformer tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__.

        This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
        should refer to the superclass for more information regarding methods.

        Args:
            vocab_file (:obj:`string`):
                `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
                contains the vocabulary necessary to instantiate a tokenizer.
            eos_token (:obj:`string`, `optional`, defaults to "</s>"):
                The end of sequence token.

                .. note::

                    When building a sequence using special tokens, this is not the token that is used for the end
                    of sequence. The token used is the :obj:`sep_token`.
            unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
                The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
                token instead.
            pad_token (:obj:`string`, `optional`, defaults to "<pad>"):
                The token used for padding, for example when batching sequences of different lengths.
            additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`None`):
                Additional special tokens used by the tokenizer.
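
        Example (a minimal usage sketch; it assumes the ``sentencepiece`` package is
        installed and the pretrained vocabulary listed above is reachable)::

            tokenizer = ReformerTokenizer.from_pretrained("google/reformer-crime-and-punishment")
            ids = tokenizer.encode("Crime and Punishment")
            text = tokenizer.decode(ids)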
    z</s>z<unk>z<pad>c                    sj   t  jf ||||d| zdd l}W n  tk
rH   td  Y nX || _| | _| j	| d S )N)	eos_token	unk_token	pad_tokenadditional_special_tokensr   z{You need to install SentencePiece to use ReformerTokenizer:https://github.com/google/sentencepiecepip install sentencepiece)
super__init__sentencepieceImportErrorloggerwarningr   SentencePieceProcessorsp_modelLoad)selfr   r   r   r	   r
   kwargsspm	__class__ F/tmp/pip-unpacked-wheel-ymerj3tt/transformers/tokenization_reformer.pyr   U   s$    	
zReformerTokenizer.__init__c                 C   s
   | j  S )N)r   get_piece_sizer   r   r   r   
vocab_sizet   s    zReformerTokenizer.vocab_sizec                    s(    fddt  jD }| j |S )Nc                    s   i | ]}  ||qS r   )Zconvert_ids_to_tokens).0ir   r   r   
<dictcomp>y   s      z/ReformerTokenizer.get_vocab.<locals>.<dictcomp>)ranger   updateZadded_tokens_encoder)r   Zvocabr   r   r   	get_vocabx   s    zReformerTokenizer.get_vocabc                 C   s   | j  }d |d< |S )Nr   )__dict__copy)r   stater   r   r   __getstate__}   s    
zReformerTokenizer.__getstate__c                 C   sP   || _ zdd l}W n  tk
r2   td  Y nX | | _| j| j d S )Nr   z|You need to install SentencePiece to use ReformerTokenizer: https://github.com/google/sentencepiecepip install sentencepiece)	r$   r   r   r   r   r   r   r   r   )r   dr   r   r   r   __setstate__   s    
zReformerTokenizer.__setstate__Fc                 C   s&   |s| j |}n| j |dd}|S )zZ Take as input a string and return a list of strings (tokens) for words/sub-words
        @   g?)r   ZEncodeAsPiecesZSampleEncodeAsPieces)r   textsamplepiecesr   r   r   	_tokenize   s    zReformerTokenizer._tokenizec                 C   s   | j |S )z2 Converts a token (str) in an id using the vocab. )r   Zpiece_to_id)r   tokenr   r   r   _convert_token_to_id   s    z&ReformerTokenizer._convert_token_to_idc                 C   s   || j  k r| j |}|S )z=Converts an index (integer) in a token (str) using the vocab.)r   r   Z	IdToPiece)r   indexr/   r   r   r   _convert_id_to_token   s    z&ReformerTokenizer._convert_id_to_tokenc                 C   s   | j |}|S )z< Converts a sequence of tokens (string) in a single string. )r   Zdecode_pieces)r   tokensZ
out_stringr   r   r   convert_tokens_to_string   s    z*ReformerTokenizer.convert_tokens_to_stringc                 C   s^   t j|s td| dS t j|td }t j| j	t j|krXt

    def save_vocabulary(self, save_directory):
        """ Save the sentencepiece vocabulary (copy original file) and special tokens file
            to a directory.
        """
        if not os.path.isdir(save_directory):
            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
            return
        out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"])

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
            copyfile(self.vocab_file, out_vocab_file)

        return (out_vocab_file,)
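

# A minimal, self-contained usage sketch (not part of the original module): it
# assumes a local SentencePiece model file named "spiece.model" (a hypothetical
# path) and that the `sentencepiece` package is installed.
if __name__ == "__main__":
    tokenizer = ReformerTokenizer("spiece.model")  # hypothetical local vocab file

    # Deterministic segmentation into SentencePiece pieces; word-initial pieces
    # carry the "▁" (SPIECE_UNDERLINE) prefix.
    tokens = tokenizer.tokenize("Crime and Punishment")
    ids = tokenizer.convert_tokens_to_ids(tokens)

    # decode_pieces reassembles the pieces into plain text, and the vocabulary
    # file can be copied next to other artifacts for later reloading.
    print(tokenizer.convert_tokens_to_string(tokens))
    tokenizer.save_vocabulary(".")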