from tokenizers import Tokenizer, AddedToken, decoders, trainers
from tokenizers.models import WordPiece
from tokenizers.normalizers import BertNormalizer
from tokenizers.pre_tokenizers import BertPreTokenizer
from tokenizers.processors import BertProcessing
from .base_tokenizer import BaseTokenizer

from typing import Optional, List, Union


class BertWordPieceTokenizer(BaseTokenizer):
    """ Bert WordPiece Tokenizer """

    def __init__(
        self,
        vocab_file: Optional[str] = None,
        unk_token: Union[str, AddedToken] = "[UNK]",
        sep_token: Union[str, AddedToken] = "[SEP]",
        cls_token: Union[str, AddedToken] = "[CLS]",
        pad_token: Union[str, AddedToken] = "[PAD]",
        mask_token: Union[str, AddedToken] = "[MASK]",
        clean_text: bool = True,
        handle_chinese_chars: bool = True,
        strip_accents: bool = True,
        lowercase: bool = True,
        wordpieces_prefix: str = "##",
    ):
        # Build the core model: a WordPiece model backed by the given vocab,
        # or an empty one that can be trained later.
        if vocab_file is not None:
            tokenizer = Tokenizer(WordPiece(vocab_file, unk_token=str(unk_token)))
        else:
            tokenizer = Tokenizer(WordPiece())

        # Let the tokenizer know about special tokens if they are part of the vocab
        if tokenizer.token_to_id(str(unk_token)) is not None:
            tokenizer.add_special_tokens([str(unk_token)])
        if tokenizer.token_to_id(str(sep_token)) is not None:
            tokenizer.add_special_tokens([str(sep_token)])
        if tokenizer.token_to_id(str(cls_token)) is not None:
            tokenizer.add_special_tokens([str(cls_token)])
        if tokenizer.token_to_id(str(pad_token)) is not None:
            tokenizer.add_special_tokens([str(pad_token)])
        if tokenizer.token_to_id(str(mask_token)) is not None:
            tokenizer.add_special_tokens([str(mask_token)])

        tokenizer.normalizer = BertNormalizer(
            clean_text=clean_text,
            handle_chinese_chars=handle_chinese_chars,
            strip_accents=strip_accents,
            lowercase=lowercase,
        )
        tokenizer.pre_tokenizer = BertPreTokenizer()

        # With a vocabulary available, wire up the BERT post-processor, which
        # wraps every encoding in [CLS] ... [SEP].
        if vocab_file is not None:
            sep_token_id = tokenizer.token_to_id(str(sep_token))
            if sep_token_id is None:
                raise TypeError("sep_token not found in the vocabulary")
            cls_token_id = tokenizer.token_to_id(str(cls_token))
            if cls_token_id is None:
                raise TypeError("cls_token not found in the vocabulary")

            tokenizer.post_processor = BertProcessing(
                (str(sep_token), sep_token_id), (str(cls_token), cls_token_id)
            )
        tokenizer.decoder = decoders.WordPiece(prefix=wordpieces_prefix)

        parameters = {
            "model": "BertWordPiece",
            "unk_token": unk_token,
            "sep_token": sep_token,
            "cls_token": cls_token,
            "pad_token": pad_token,
            "mask_token": mask_token,
            "clean_text": clean_text,
            "handle_chinese_chars": handle_chinese_chars,
            "strip_accents": strip_accents,
            "lowercase": lowercase,
            "wordpieces_prefix": wordpieces_prefix,
        }

        super().__init__(tokenizer, parameters)

    def train(
        self,
        files: Union[str, List[str]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        limit_alphabet: int = 1000,
        initial_alphabet: List[str] = [],
        special_tokens: List[Union[str, AddedToken]] = [
            "[PAD]",
            "[UNK]",
            "[CLS]",
            "[SEP]",
            "[MASK]",
        ],
        show_progress: bool = True,
        wordpieces_prefix: str = "##",
    ):
        """ Train the model using the given files """

        trainer = trainers.WordPieceTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            limit_alphabet=limit_alphabet,
            initial_alphabet=initial_alphabet,
            special_tokens=special_tokens,
            show_progress=show_progress,
            continuing_subword_prefix=wordpieces_prefix,
        )
        # Accept a single path as well as a list of paths
        if isinstance(files, str):
            files = [files]
        self._tokenizer.train(trainer, files)
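

# Training sketch (illustrative, not part of the original module). Builds a
# fresh WordPiece vocabulary from plain-text files; "corpus.txt" is a
# placeholder path, and vocab_size simply restates the default.
#
#     tokenizer = BertWordPieceTokenizer()
#     tokenizer.train("corpus.txt", vocab_size=30000)
#     encoding = tokenizer.encode("some text to tokenize")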