# coding=utf-8
"""Tokenization classes for OpenAI GPT."""


import json
import logging
import os
from functools import lru_cache

import regex as re
from tokenizers import ByteLevelBPETokenizer

from .tokenization_utils import PreTrainedTokenizer, PreTrainedTokenizerFast


logger = logging.getLogger(__name__)

VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.json",
    "merges_file": "merges.txt",
}

PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json",
        "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-vocab.json",
        "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-vocab.json",
        "gpt2-xl": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-vocab.json",
        "distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-vocab.json",
    },
    "merges_file": {
        "gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt",
        "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-merges.txt",
        "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-merges.txt",
        "gpt2-xl": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-merges.txt",
        "distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-merges.txt",
    },
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "gpt2": 1024,
    "gpt2-medium": 1024,
    "gpt2-large": 1024,
    "gpt2-xl": 1024,
    "distilgpt2": 1024,
}


@lru_cache()
def bytes_to_unicode():
    """
    Returns a list of utf-8 bytes and a mapping to unicode strings.
    We specifically avoid mapping to whitespace/control characters the bpe code barfs on.

    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
    This is a significant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    """
    bs = (
        list(range(ord("!"), ord("~") + 1))
        + list(range(ord("¡"), ord("¬") + 1))
        + list(range(ord("®"), ord("ÿ") + 1))
    )
    cs = bs[:]
    n = 0
    for b in range(2 ** 8):
        if b not in bs:
            bs.append(b)
            cs.append(2 ** 8 + n)
            n += 1
    cs = [chr(n) for n in cs]
    return dict(zip(bs, cs))
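
# Illustrative example of the table built by bytes_to_unicode() (added note,
# not part of the original source): the 188 printable bytes map to themselves,
# while the remaining 68 bytes are shifted into code points starting at
# U+0100. The space byte 0x20 therefore becomes "Ġ" (chr(256 + 32)), which is
# why GPT-2 subword tokens show a leading "Ġ" where a space was:
#
#     byte_encoder = bytes_to_unicode()
#     byte_encoder[ord("A")]   # 'A'
#     byte_encoder[ord(" ")]   # 'Ġ'
#     byte_encoder[ord("\n")]  # 'Ċ'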

def get_pairs(word):
    """Return set of symbol pairs in a word.

    Word is represented as tuple of symbols (symbols being variable-length strings).
    """
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char
    return pairs
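
# For example (added note, not part of the original source):
#
#     get_pairs(("h", "e", "l", "l", "o"))
#     # -> {('h', 'e'), ('e', 'l'), ('l', 'l'), ('l', 'o')}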
edd Zdd	 Zd
d Zdd Zdd Zdd Zdd Zdd Zdd Z  ZS )GPT2Tokenizera  
    GPT-2 BPE tokenizer. Peculiarities:

    - Byte-level Byte-Pair-Encoding
    - Requires a space to start the input string => the encoding methods should be called with the
      ``add_prefix_space`` flag set to ``True``.
      Otherwise, this tokenizer ``encode`` and ``decode`` method will not conserve
      the absence of a space at the beginning of a string:

    ::

        tokenizer.decode(tokenizer.encode("Hello")) = " Hello"

    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
    should refer to the superclass for more information regarding methods.

    Args:
        vocab_file (:obj:`str`):
            Path to the vocabulary file.
        merges_file (:obj:`str`):
            Path to the merges file.
        errors (:obj:`str`, `optional`, defaults to "replace"):
            Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode
            <https://docs.python.org/3/library/stdtypes.html#bytes.decode>`__ for more information.
        unk_token (:obj:`string`, `optional`, defaults to `<|endoftext|>`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        bos_token (:obj:`string`, `optional`, defaults to `<|endoftext|>`):
            The beginning of sequence token.
        eos_token (:obj:`string`, `optional`, defaults to `<|endoftext|>`):
            The end of sequence token.
    replace<|endoftext|>c              	      s   t  jf |||d| t|dd}t|| _W 5 Q R X dd | j D | _|| _t	 | _
dd | j
 D | _t|dd}	|	 ddd	 }
W 5 Q R X d
d |
D }
tt|
tt|
| _i | _td| _d S )N	bos_token	eos_token	unk_tokenutf-8encodingc                 S   s   i | ]\}}||qS r   r   r   kvr   r   r   
<dictcomp>   s      z*GPT2Tokenizer.__init__.<locals>.<dictcomp>c                 S   s   i | ]\}}||qS r   r   r0   r   r   r   r3      s      
r   c                 S   s   g | ]}t | qS r   )tuplesplit)r   merger   r   r   r      s     z*GPT2Tokenizer.__init__.<locals>.<listcomp>zJ's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+)super__init__openjsonloadencoderitemsdecodererrorsr   byte_encoderbyte_decoderreadr7   r   r   r   len	bpe_rankscacherecompilepat)selfr   r   rA   r,   r*   r+   kwargsZvocab_handleZmerges_handleZ
bpe_merges	__class__r   r   r:      s    
 zGPT2Tokenizer.__init__c                 C   s
    @property
    def vocab_size(self):
        return len(self.encoder)

    def get_vocab(self):
        return dict(self.encoder, **self.added_tokens_encoder)

    def bpe(self, token):
        if token in self.cache:
            return self.cache[token]
        word = tuple(token)
        pairs = get_pairs(word)

        if not pairs:
            return token

        while True:
            # Merge the adjacent pair with the lowest rank; unknown pairs rank as infinity.
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                except ValueError:
                    new_word.extend(word[i:])
                    break
                else:
                    new_word.extend(word[i:j])
                    i = j

                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = " ".join(word)
        self.cache[token] = word
        return word
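
    # Sketch of bpe()'s behavior (added note, not part of the original
    # source): the token arrives already byte-encoded, e.g. "Ġworld" for
    # " world". The loop keeps applying the lowest-ranked merge until no
    # known pair remains, then joins the surviving subwords with spaces:
    #
    #     self.bpe("Ġworld")  # -> "Ġworld" if " world" is a single vocab
    #                         #    entry, otherwise space-joined subwords
    #                         #    such as "Ġwor ld" (split depends on the
    #                         #    learned merge ranks)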
zGPT2Tokenizer.bpec                    sZ   g }t  j|D ]B}d fdd|dD }|dd  |dD  q|S )z Tokenize a string.  c                 3   s   | ]} j | V  qd S rO   )rB   )r   r   rP   r   r   	<genexpr>   s    z*GPT2Tokenizer._tokenize.<locals>.<genexpr>r-   c                 s   s   | ]
}|V  qd S rO   r   )r   Z	bpe_tokenr   r   r   ri      s     r\   )rH   findallrJ   ra   encoder`   rg   r7   )rK   text
bpe_tokensrb   r   rP   r   	_tokenize   s    "zGPT2Tokenizer._tokenizec                 C   s   | j || j | jS )z2 Converts a token (str) in an id using the vocab. )r>   rT   r,   )rK   rb   r   r   r   _convert_token_to_id   s    z"GPT2Tokenizer._convert_token_to_idc                 C   s   | j |S )z=Converts an index (integer) in a token (str) using the vocab.)r@   rT   )rK   r^   r   r   r   _convert_id_to_token   s    z"GPT2Tokenizer._convert_id_to_tokenc                    s0   d |}t fdd|D jd jd}|S )z< Converts a sequence of tokens (string) in a single string. rh   c                    s   g | ]} j | qS r   )rC   )r   crP   r   r   r      s     z:GPT2Tokenizer.convert_tokens_to_string.<locals>.<listcomp>r-   )rA   )ra   	bytearraydecoderA   )rK   tokensrl   r   rP   r   convert_tokens_to_string   s    
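
    # Round-trip sketch for the four methods above (added note, not part of
    # the original source), assuming the standard GPT-2 vocabulary:
    #
    #     tokens = tokenizer._tokenize(" world")      # ['Ġworld']
    #     ids = [tokenizer._convert_token_to_id(t) for t in tokens]
    #     tokenizer.convert_tokens_to_string(tokens)  # ' world'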
"z&GPT2Tokenizer.convert_tokens_to_stringc           	   	   C   s   t j|s td| dS t j|td }t j|td }t|ddd}|	t
j| jdd	 W 5 Q R X d
}t|dddh}|	d t| j dd dD ]@\}}||krtd| |}|	d|d  |d7 }qW 5 Q R X ||fS )a  
        Save the vocabulary and special tokens file to a directory.

        Args:
            save_directory (:obj:`str`):
                The directory in which to save the vocabulary.

        Returns:
            :obj:`Tuple(str)`: Paths to the files saved.
        z*Vocabulary path ({}) should be a directoryNr   r   wr-   r.   F)ensure_asciir   z#version: 0.2
c                 S   s   | d S )Nr   r   )kvr   r   r   rW     rX   z/GPT2Tokenizer.save_vocabulary.<locals>.<lambda>rY   zqSaving vocabulary to {}: BPE merge indices are not consecutive. Please check that the tokenizer is not corrupted!r\   r4   r   )ospathisdirloggererrorformatra   VOCAB_FILES_NAMESr;   writer<   dumpsr>   sortedrF   r?   warning)	rK   Zsave_directoryr   Z
merge_filefr^   writerrm   Ztoken_indexr   r   r   save_vocabulary   s*     
zGPT2Tokenizer.save_vocabularyc                 K   s   d|kr|d rd| S |S )Nadd_prefix_spacer\   r   )rK   rl   rL   r   r   r   prepare_for_tokenization  s    z&GPT2Tokenizer.prepare_for_tokenization)r'   r(   r(   r(   )__name__
__module____qualname____doc__r   vocab_files_namesPRETRAINED_VOCAB_FILES_MAPpretrained_vocab_files_map&PRETRAINED_POSITIONAL_EMBEDDINGS_SIZESmax_model_input_sizesr:   propertyrQ   rR   rg   rn   ro   rp   ru   r   r   __classcell__r   r   rM   r   r&   e   s&   !    
*

class GPT2TokenizerFast(PreTrainedTokenizerFast):
    """
    Constructs a "Fast" GPT-2 BPE tokenizer (backed by HuggingFace's `tokenizers` library).

    Peculiarities:

    - Byte-level Byte-Pair-Encoding
    - Requires a space to start the input string => the encoding methods should be called with the
      ``add_prefix_space`` flag set to ``True``.
      Otherwise, this tokenizer's ``encode`` and ``decode`` methods will not conserve
      the absence of a space at the beginning of a string:

    ::

        tokenizer.decode(tokenizer.encode("Hello")) = " Hello"

    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the methods. Users
    should refer to the superclass for more information regarding methods.

    Args:
        vocab_file (:obj:`str`):
            Path to the vocabulary file.
        merges_file (:obj:`str`):
            Path to the merges file.
        errors (:obj:`str`, `optional`, defaults to "replace"):
            Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode
            <https://docs.python.org/3/library/stdtypes.html#bytes.decode>`__ for more information.
        unk_token (:obj:`string`, `optional`, defaults to `<|endoftext|>`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        bos_token (:obj:`string`, `optional`, defaults to `<|endoftext|>`):
            The beginning of sequence token.
        eos_token (:obj:`string`, `optional`, defaults to `<|endoftext|>`):
            The end of sequence token.
        add_prefix_space (:obj:`bool`, `optional`, defaults to `False`):
            Whether to add a leading space to the first word.
            This allows the leading word to be treated just like any other word.
            (The GPT-2 tokenizer detects the beginning of words by the preceding space.)
        trim_offsets (:obj:`bool`, `optional`, defaults to `True`):
            Whether the post processing step should trim offsets to avoid including whitespaces.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

    def __init__(
        self,
        vocab_file,
        merges_file,
        unk_token="<|endoftext|>",
        bos_token="<|endoftext|>",
        eos_token="<|endoftext|>",
        add_prefix_space=False,
        trim_offsets=True,
        **kwargs
    ):
        super().__init__(
            ByteLevelBPETokenizer(
                vocab_file=vocab_file,
                merges_file=merges_file,
                add_prefix_space=add_prefix_space,
                trim_offsets=trim_offsets,
            ),
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            **kwargs,
        )