U
    &ch!                  8   @   s   d Z ddlZddlZddlZddlZddlmZ ee	Z
dddZdd	idd
idZddiZddddddddddddddddddddd d!d"d#d$d%d&d'd(d)d*d+d,d-d.d/d0d1d2d3d4d5d6d7d8d9d:d;d<d=d>d?d@dAdBdC7ZdDdE ZG dFdG dGeZdS )Hz)Tokenization classes for Salesforce CTRL.    N   )PreTrainedTokenizerz
vocab.jsonz
merges.txt)
vocab_filemerges_fileZctrlzHhttps://raw.githubusercontent.com/salesforce/ctrl/master/ctrl-vocab.jsonzHhttps://raw.githubusercontent.com/salesforce/ctrl/master/ctrl-merges.txt   i i  i i  i  i#j  iv i~ i6  i  iv  i i.  i i  iך  iͨ  i  i%  i  i  i3  iR- in  iS.  iK  i iw  i  i[  i*  i  i  i/  i?  i in1  i  ip i  i i iϒ  i	  i) i- i( i  iK i  i iǢ  i  ih  i )7Z	PregnancyZChristianityZExplainZFitnessZSavingZAskZAssZJokeZ	QuestionsZThoughtsZRetailZFeminismZWritingZAtheismZNetflixZ	ComputingZOpinionZAloneFunnyZGamingZHumanZIndiaZJokerZDietZLegalZNormanZTipZWeightZMoviesZRunningZScienceZHorrorZ
ConfessionZFinanceZPoliticsZScaryZSupportZTechnologiesZTeenageEventZLearnedZNotionZ	WikipediaZBooksZExtractZConfessionsZ
ConspiracyZLinksZ	NarcissusZRelationshipZRelationshipsZReviewsZNewsZTranslationZmultilingualc                 C   s>   t  }| d }| dd D ]}|||f |}qt |}|S )zReturn set of symbol pairs in a word.

    Word is represented as tuple of symbols (symbols being variable-length strings).
    r   r   N)setadd)wordpairsZ	prev_charchar r   B/tmp/pip-unpacked-wheel-ymerj3tt/transformers/tokenization_ctrl.py	get_pairsf   s    r   c                       sv   e Zd ZdZeZeZeZ	e
Zd fdd	Zedd Zdd Zd	d
 Zdd Zdd Zdd Zdd Zdd Z  ZS )CTRLTokenizera  
    Constructs a CTRL tokenizer. Peculiarities:

    - Byte-Pair-Encoding

    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
    should refer to the superclass for more information regarding methods.

    Args:
        vocab_file (:obj:`str`):
            Path to the vocabulary file.
        merges_file (:obj:`str`):
            Path to the merges file.
        unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
    <unk>c              	      s   t  jf d|i| t|dd}t|| _W 5 Q R X dd | j D | _t|dd}| 	ddd }W 5 Q R X d	d
 |D }t
t|tt|| _i | _d S )N	unk_tokenutf-8encodingc                 S   s   i | ]\}}||qS r   r   ).0kvr   r   r   
<dictcomp>   s      z*CTRLTokenizer.__init__.<locals>.<dictcomp>
r   c                 S   s   g | ]}t | qS r   )tuplesplit)r   merger   r   r   
<listcomp>   s     z*CTRLTokenizer.__init__.<locals>.<listcomp>)super__init__openjsonloadencoderitemsdecoderreadr   dictziprangelen	bpe_rankscache)selfr   r   r   kwargsZvocab_handleZmerges_handleZmerges	__class__r   r   r"      s     zCTRLTokenizer.__init__c                 C   s
   t | jS N)r-   r&   r0   r   r   r   
vocab_size   s    zCTRLTokenizer.vocab_sizec                 C   s   t | jf| jS r4   )r*   r&   Zadded_tokens_encoderr5   r   r   r   	get_vocab   s    zCTRLTokenizer.get_vocabc           
         s  | j kr j | S t|}tt|d d |d d g }t|}|sN|S t| fddd}| jkrpqn|\}}g }d}|t|k rDz|||}	W n, tk
r   |	||d   Y qDY nX |	|||	  |	}|| |kr,|t|d k r,||d  |kr,|
||  |d7 }q|
||  |d7 }qt|}|}t|dkrdqnqNt|}qNd	|}|d d
 }| j |< |S )Nr   z</w>c                    s    j | tdS )Ninf)r.   getfloat)pairr5   r   r   <lambda>       z#CTRLTokenizer.bpe.<locals>.<lambda>keyr   r      @@ )r/   r   listr   minr.   r-   index
ValueErrorextendappendjoin)
r0   tokenr   r   ZbigramfirstsecondZnew_wordijr   r5   r   bpe   sF    

"
2




zCTRLTokenizer.bpec                 C   s>   g }t d|}|D ]$}|dd | |dD  q|S )z Tokenize a string.
        z\S+\n?c                 S   s   g | ]}|qS r   r   )r   tr   r   r   r       s     z+CTRLTokenizer._tokenize.<locals>.<listcomp> )refindallrG   rO   r   )r0   textZsplit_tokenswordsrJ   r   r   r   	_tokenize   s
    "zCTRLTokenizer._tokenizec                 C   s   | j || j | jS )z2 Converts a token (str) in an id using the vocab. )r&   r9   r   )r0   rJ   r   r   r   _convert_token_to_id   s    z"CTRLTokenizer._convert_token_to_idc                 C   s   | j || jS )z=Converts an index (integer) in a token (str) using the vocab.)r(   r9   r   )r0   rE   r   r   r   _convert_id_to_token   s    z"CTRLTokenizer._convert_id_to_tokenc                 C   s   d |dd }|S )z< Converts a sequence of tokens (string) in a single string. rQ   rA    )rI   replacestrip)r0   tokensZ
out_stringr   r   r   convert_tokens_to_string   s    z&CTRLTokenizer.convert_tokens_to_stringc           	   	   C   s   t j|s td| dS t j|td }t j|td }t|ddd}|	t
j| jdd	 W 5 Q R X d
}t|dddh}|	d t| j dd dD ]@\}}||krtd| |}|	d|d  |d7 }qW 5 Q R X ||fS )a  
        Save the vocabulary and special tokens file to a directory.

        Args:
            save_directory (:obj:`str`):
                The directory in which to save the vocabulary.

        Returns:
            :obj:`Tuple(str)`: Paths to the files saved.
        z*Vocabulary path ({}) should be a directoryNr   r   wr   r   F)ensure_asciir   z#version: 0.2
c                 S   s   | d S )Nr   r   )kvr   r   r   r<      r=   z/CTRLTokenizer.save_vocabulary.<locals>.<lambda>r>   zqSaving vocabulary to {}: BPE merge indices are not consecutive. Please check that the tokenizer is not corrupted!rQ   r   r   )ospathisdirloggererrorformatrI   VOCAB_FILES_NAMESr#   writer$   dumpsr&   sortedr.   r'   warning)	r0   Zsave_directoryr   Z
merge_filefrE   writerZ
bpe_tokensZtoken_indexr   r   r   save_vocabulary   s*     
zCTRLTokenizer.save_vocabulary)r   )__name__
__module____qualname____doc__rg   Zvocab_files_namesPRETRAINED_VOCAB_FILES_MAPZpretrained_vocab_files_map&PRETRAINED_POSITIONAL_EMBEDDINGS_SIZESZmax_model_input_sizesCONTROL_CODESZcontrol_codesr"   propertyr6   r7   rO   rV   rW   rX   r]   rn   __classcell__r   r   r2   r   r   u   s   
,r   )rr   r$   loggingra   regexrR   Ztokenization_utilsr   	getLoggerro   rd   rg   rs   rt   ru   r   r   r   r   r   r   <module>   s   
 ;