U
    #c	                     @   sh   d dl mZmZmZmZmZ d dlmZ d dlm	Z	 ddl
mZ d dlmZmZmZ G dd deZd	S )
    )	Tokenizer
AddedTokenpre_tokenizersdecoderstrainers)BPE)NFKC   )BaseTokenizer)OptionalListUnionc                	       s   e Zd ZdZdee ee eeef eeee	 d fddZ
d	d
dgdg dfeeee f eeeeeef  eee edddZ  ZS )SentencePieceBPETokenizerzs SentencePiece BPE Tokenizer

    Represents the BPE algorithm, with the pretokenization used by SentencePiece
    N<unk>   ▁T)
vocab_filemerges_file	unk_tokenreplacementadd_prefix_spacedropoutc           	         s   |d k	r&|d k	r&t t||||d}n
t t }|t|d k	rR|t|g t |_tj||d|_	t
j||d|_d||||d}t || d S )N)r   r   )r   r   ZSentencePieceBPE)modelr   r   r   r   )r   r   Ztoken_to_idstrZadd_special_tokensr   Z
normalizerr   Z	MetaspaceZpre_tokenizerr   decodersuper__init__)	selfr   r   r   r   r   r   	tokenizer
parameters	__class__ P/tmp/pip-unpacked-wheel-zq7pv0lh/tokenizers/implementations/sentencepiece_bpe.pyr      s.    	
  z"SentencePieceBPETokenizer.__init__i0u     i  )files
vocab_sizemin_frequencyspecial_tokenslimit_alphabetinitial_alphabetshow_progressc           	      C   s8   t j||||||d}t|tr&|g}| j|| dS )z' Train the model using the given files )r%   r&   r'   r(   r)   r*   N)r   Z
BpeTrainer
isinstancer   
_tokenizertrain)	r   r$   r%   r&   r'   r(   r)   r*   Ztrainerr!   r!   r"   r-   4   s    
zSentencePieceBPETokenizer.train)NNr   r   TN)__name__
__module____qualname____doc__r   r   r   r   boolfloatr   r   intr-   __classcell__r!   r!   r   r"   r   	   s<         
(r   N)Z
tokenizersr   r   r   r   r   Ztokenizers.modelsr   Ztokenizers.normalizersr   Zbase_tokenizerr
   typingr   r   r   r   r!   r!   r!   r"   <module>   s
   