from .. import Tokenizer, AddedToken, pre_tokenizers, decoders, trainers
from ..models import BPE
from ..normalizers import Sequence, Lowercase, unicode_normalizer_from_str, BertNormalizer
from .base_tokenizer import BaseTokenizer

from typing import Optional, List, Union


class CharBPETokenizer(BaseTokenizer):
    """ Original BPE Tokenizer

    Represents the BPE algorithm, as introduced by Rico Sennrich
    (https://arxiv.org/abs/1508.07909)

    The default settings correspond to the OpenAI GPT BPE tokenizer and differ from the original
    Sennrich subword-nmt implementation by the following options, which you can deactivate:
        - adding a normalizer to clean up the text (deactivate with `bert_normalizer=False`) by:
            * removing any control characters and replacing all whitespaces with the classic one,
            * handling Chinese characters by putting spaces around them,
            * stripping all accents.
        - splitting on punctuation in addition to whitespace (deactivate it with
          `split_on_whitespace_only=True`)
    N<unk></w>FT)	
vocab_filemerges_file	unk_tokensuffixdropout	lowercaseunicode_normalizerbert_normalizersplit_on_whitespace_onlyc
              	      s  |d k	r,|d k	r,t t|||t||d}
n
t t }
|
t|d k	rX|
t|g g }|rn|t|g7 }|r|tddg7 }|r|t g7 }t|dkrt|dkrt	||
_
n
|d |
_
|	rt |
_n
t |
_tj|d|
_d|||||||	d}t |
| d S )	N)r   r   end_of_word_suffixF)r   r   r   )r   r   )modelr   r   r   r   r   r   r   )r   r   strZtoken_to_idZadd_special_tokensr
   r   r	   lenr   Z
normalizerr   ZWhitespaceSplitZpre_tokenizerZBertPreTokenizerr   Z
BPEDecoderdecodersuper__init__)selfr   r   r   r   r   r   r   r   r   	tokenizernormalizers
parameters	__class__ M/tmp/pip-unpacked-wheel-zq7pv0lh/tokenizers/implementations/char_level_bpe.pyr$      sL    



zCharBPETokenizer.__init__i0u  r   i  )files
vocab_sizemin_frequencyspecial_tokenslimit_alphabetinitial_alphabetr   show_progressc	           
   	   C   s:   t j|||||||d}	t|tr(|g}| j|	| dS )z' Train the model using the given files )r.   r/   r0   r1   r2   r   r3   N)r   Z
BpeTrainer
isinstancer    
_tokenizertrain)
r%   r-   r.   r/   r0   r1   r2   r   r3   Ztrainerr+   r+   r,   r6   \   s    	
zCharBPETokenizer.train)	NNr   r   NFNTF)__name__
__module____qualname____doc__r   r    r   r   floatboolr$   r   intr6   __classcell__r+   r+   r)   r,   r   	   sL            
Fr   N) r   r   r   r   r   modelsr   r'   r   r	   r
   r   Zbase_tokenizerr   typingr   r   r   r   r+   r+   r+   r,   <module>   s
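
# ---------------------------------------------------------------------------
# Minimal usage sketch of the class above: build an empty tokenizer, train it
# with `train`, then encode a sentence. It assumes only the public methods
# shown in this module plus `encode` from `BaseTokenizer`; the corpus text and
# the temporary file are made-up placeholders for illustration.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import tempfile

    # Create a tiny throw-away corpus so the sketch is self-contained.
    with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as corpus:
        corpus.write("hello world\nhello tokenizers\nlowercase text with accents\n")
        corpus_path = corpus.name

    # Train a small BPE vocabulary from scratch, lowercasing the input.
    tok = CharBPETokenizer(lowercase=True)
    tok.train(corpus_path, vocab_size=100, min_frequency=1)

    # Encode a sentence; tokens are subword merges, with the default "</w>"
    # suffix marking word endings.
    encoding = tok.encode("Hello world")
    print(encoding.tokens)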
   