"""Tokenization classes for OpenAI GPT."""


import json
import logging
import os
import re

from tokenizers import CharBPETokenizer

from .tokenization_bert import BasicTokenizer
from .tokenization_utils import PreTrainedTokenizer, PreTrainedTokenizerFast


logger = logging.getLogger(__name__)

VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.json",
    "merges_file": "merges.txt",
}

PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json"},
    "merges_file": {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt"},
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "openai-gpt": 512,
}


def get_pairs(word):
    """
    Return set of symbol pairs in a word.
    word is represented as tuple of symbols (symbols being variable-length strings)
    """
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char
    return pairs


def text_standardize(text):
    """
    fixes some issues the spacy tokenizer had on books corpus
    also does some whitespace standardization
    """
    text = text.replace("—", "-")
    text = text.replace("–", "-")
    text = text.replace("―", "-")
    text = text.replace("…", "...")
    text = text.replace("´", "'")
    text = re.sub(r"""(-+|~+|!+|"+|;+|\?+|\++|,+|\)+|\(+|\\+|\/+|\*+|\[+|\]+|}+|{+|\|+|_+)""", r" \1 ", text)
    text = re.sub(r"\s*\n\s*", " \n ", text)
    text = re.sub(r"[^\S\n]+", " ", text)
    return text.strip()
edd Zdd Zd	d
 Zdd Zdd Zdd Zdd Zdd Z  ZS )OpenAIGPTTokenizera  
    BPE tokenizer. Peculiarities:

    - lower case all inputs
    - uses SpaCy tokenizer and ftfy for pre-BPE tokenization if they are installed, fallback to BERT's BasicTokenizer if not.

    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
    should refer to the superclass for more information regarding methods.

    Args:
        vocab_file (:obj:`str`):
            Path to the vocabulary file.
        merges_file (:obj:`str`):
            Path to the merges file.
        unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
    <unk>c              	      s  t  jf d|i| z4dd l}ddlm} | }|j|| _|j| _W n0 t	k
rz   t
d tdd| _d | _Y nX t|dd}t|| _W 5 Q R X d	d
 | j D | _t|dd}	|	 ddd }
W 5 Q R X dd |
D }
tt|
tt|
| _i | _d S )N	unk_tokenr   )EnglishzQftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.T)Zdo_lower_caseutf-8encodingc                 S   s   i | ]\}}||qS r   r   ).0kvr   r   r   
<dictcomp>t   s      z/OpenAIGPTTokenizer.__init__.<locals>.<dictcomp>
r   c                 S   s   g | ]}t | qS r   )tuplesplit)r"   merger   r   r   
<listcomp>w   s     z/OpenAIGPTTokenizer.__init__.<locals>.<listcomp>)super__init__ftfyZspacy.lang.enr   ZDefaultsZcreate_tokenizernlpfix_textImportErrorloggerwarningr   openjsonloadencoderitemsdecoderreadr)   dictziprangelen	bpe_rankscache)selfr   r   r   kwargsr.   r   Z_nlpZvocab_handleZmerges_handleZmerges	__class__r   r   r-   c   s&    
 zOpenAIGPTTokenizer.__init__c                 C   s
   t | jS N)r>   r7   rA   r   r   r   
vocab_size{   s    zOpenAIGPTTokenizer.vocab_sizec                 C   s   t | jf| jS rE   )r;   r7   Zadded_tokens_encoderrF   r   r   r   	get_vocab   s    zOpenAIGPTTokenizer.get_vocabc           
         s  t |d d |d d f }| jkr2 j| S t|}|sF|d S t| fddd}| jkrhqf|\}}g }d}|t|k r<z|||}	W n, tk
r   |||d   Y q<Y nX ||||	  |	}|| |kr$|t|d k r$||d  |kr$|	||  |d7 }qx|	||  |d7 }qxt |}|}t|dkr\qfqFt|}qFd	
|}|d
kr~d}| j|< |S )Nr'   </w>c                    s    j | tdS )Ninf)r?   getfloat)pairrF   r   r   <lambda>       z(OpenAIGPTTokenizer.bpe.<locals>.<lambda>keyr   r      r   z
  </w>z
</w>)r(   r@   r   minr?   r>   index
ValueErrorextendappendjoin)
rA   tokenr   r   ZbigramfirstsecondZnew_wordijr   rF   r   bpe   sF    


2





zOpenAIGPTTokenizer.bpec                 C   s   g }| j dkrF| j|}|D ]$}|dd | |dD  qnD| t|  |}|D ]*}|dd | |j dD  q^|S )z Tokenize a string. Nc                 S   s   g | ]}|qS r   r   r"   tr   r   r   r+      s     z0OpenAIGPTTokenizer._tokenize.<locals>.<listcomp>r   c                 S   s   g | ]}|qS r   r   r_   r   r   r   r+      s     )	r0   r/   tokenizerV   r^   r)   r   r   lower)rA   r   Zsplit_tokensrY   r   r   r   	_tokenize   s    
$(zOpenAIGPTTokenizer._tokenizec                 C   s   | j || j | jS )z2 Converts a token (str) in an id using the vocab. )r7   rK   r   )rA   rY   r   r   r   _convert_token_to_id   s    z'OpenAIGPTTokenizer._convert_token_to_idc                 C   s   | j || jS )z0Converts an id in a token (BPE) using the vocab.)r9   rK   r   )rA   rT   r   r   r   _convert_id_to_token   s    z'OpenAIGPTTokenizer._convert_id_to_tokenc                 C   s   d |dd }|S )z< Converts a sequence of tokens (string) in a single string.  rI   r   )rX   r   r   )rA   tokensZ
out_stringr   r   r   convert_tokens_to_string   s    z+OpenAIGPTTokenizer.convert_tokens_to_stringc           	   	   C   s   t j|s td| dS t j|td }t j|td }t|ddd}|	t
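
    # Illustrative sketch, not part of the original source: how the methods above
    # compose (the concrete BPE pieces shown are hypothetical).
    #
    #   tokens = tokenizer._tokenize("Hello!")       # e.g. ["hello</w>", "!</w>"]
    #   ids = [tokenizer._convert_token_to_id(t) for t in tokens]
    #   tokenizer.convert_tokens_to_string(tokens)   # -> "hello !"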
j| jdd	 W 5 Q R X d
}t|dddh}|	d t| j dd dD ]@\}}||krtd| |}|	d|d  |d7 }qW 5 Q R X ||fS )a  
        Save the vocabulary and special tokens file to a directory.

        Args:
            save_directory (:obj:`str`):
                The directory in which to save the vocabulary.

        Returns:
            :obj:`Tuple(str)`: Paths to the files saved.
        z*Vocabulary path ({}) should be a directoryNr   r   wr   r    F)ensure_asciir   z#version: 0.2
c                 S   s   | d S )Nr   r   )kvr   r   r   rN      rO   z4OpenAIGPTTokenizer.save_vocabulary.<locals>.<lambda>rP   zqSaving vocabulary to {}: BPE merge indices are not consecutive. Please check that the tokenizer is not corrupted!r   r&   r   )ospathisdirr2   errorformatrX   VOCAB_FILES_NAMESr4   writer5   dumpsr7   sortedr?   r8   r3   )	rA   Zsave_directoryr   Z
merge_filefrT   writerZ
bpe_tokensZtoken_indexr   r   r   save_vocabulary   s*     
z"OpenAIGPTTokenizer.save_vocabulary)r   )__name__
__module____qualname____doc__rq   vocab_files_namesPRETRAINED_VOCAB_FILES_MAPpretrained_vocab_files_map&PRETRAINED_POSITIONAL_EMBEDDINGS_SIZESmax_model_input_sizesr-   propertyrG   rH   r^   rc   rd   re   rh   rw   __classcell__r   r   rC   r   r   K   s   
,r   c                       s.   e Zd ZdZeZeZeZ	d fdd	Z
  ZS )OpenAIGPTTokenizerFastaM  
    Construct a "Fast" BPE tokenizer for OpenAI GPT (backed by HuggingFace's `tokenizers` library).

    Peculiarities:

    - lower case all inputs
    - uses SpaCy tokenizer and ftfy for pre-BPE tokenization if they are installed, fallback to BERT's BasicTokenizer if not.

    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
    should refer to the superclass for more information regarding methods.

    Args:
        vocab_file (:obj:`str`):
            Path to the vocabulary file.
        merges_file (:obj:`str`):
            Path to the merges file.
        unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
    r   c                    s,   | d| t jt|||ddf| d S )Nr   T)r   r   r   Z	lowercase)
setdefaultr,   r-   r   )rA   r   r   r   rB   rC   r   r   r-     s    zOpenAIGPTTokenizerFast.__init__)r   )rx   ry   rz   r{   rq   r|   r}   r~   r   r   r-   r   r   r   rC   r   r      s
   r   )r{   r5   loggingrl   r   Z
tokenizersr   Ztokenization_bertr   Ztokenization_utilsr   r   	getLoggerrx   r2   rq   r}   r   r   r   r   r   r   r   r   r   <module>   s,   
  $