"""Tokenization classes."""

import collections
import logging
import os
import unicodedata
from typing import Optional

from .tokenization_bert import BasicTokenizer, BertTokenizer, WordpieceTokenizer, load_vocab


logger = logging.getLogger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}

PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "bert-base-japanese": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese/vocab.txt",
        "bert-base-japanese-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking/vocab.txt",
        "bert-base-japanese-char": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char/vocab.txt",
        "bert-base-japanese-char-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking/vocab.txt",
    }
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "bert-base-japanese": 512,
    "bert-base-japanese-whole-word-masking": 512,
    "bert-base-japanese-char": 512,
    "bert-base-japanese-char-whole-word-masking": 512,
}

PRETRAINED_INIT_CONFIGURATION = {
    "bert-base-japanese": {
        "do_lower_case": False,
        "word_tokenizer_type": "mecab",
        "subword_tokenizer_type": "wordpiece",
    },
    "bert-base-japanese-whole-word-masking": {
        "do_lower_case": False,
        "word_tokenizer_type": "mecab",
        "subword_tokenizer_type": "wordpiece",
    },
    "bert-base-japanese-char": {
        "do_lower_case": False,
        "word_tokenizer_type": "mecab",
        "subword_tokenizer_type": "character",
    },
    "bert-base-japanese-char-whole-word-masking": {
        "do_lower_case": False,
        "word_tokenizer_type": "mecab",
        "subword_tokenizer_type": "character",
    },
}
Zd fdd	Zdd Z  ZS )BertJapaneseTokenizerz BERT tokenizer for Japanese textFTbasicr
   N[UNK][SEP][PAD][CLS][MASK]c                    s  t t| jf ||	|
||d| tj|s<td|t|| _	t
dd | j	 D | _|| _|r|dkrt||dd| _n2|dkrtf ||d	|pi | _ntd
||| _|r
|dkrt| j	| jd| _n*|dkrt| j	| jd| _ntd|dS )a  Constructs a MecabBertTokenizer.

        Args:
            **vocab_file**: Path to a one-wordpiece-per-line vocabulary file.
            **do_lower_case**: (`optional`) boolean (default False)
                Whether to lower case the input.
                Only has an effect when do_word_tokenize=True.
            **do_word_tokenize**: (`optional`) boolean (default True)
                Whether to do word tokenization.
            **do_subword_tokenize**: (`optional`) boolean (default True)
                Whether to do subword tokenization.
            **word_tokenizer_type**: (`optional`) string (default "basic")
                Type of word tokenizer.
            **subword_tokenizer_type**: (`optional`) string (default "wordpiece")
                Type of subword tokenizer.
            **mecab_kwargs**: (`optional`) dict passed to `MecabTokenizer` constructor (default None)
        """
        super().__init__(
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            **kwargs
        )

        if not os.path.isfile(vocab_file):
            raise ValueError(
                "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
                "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)
            )
        self.vocab = load_vocab(vocab_file)
        # Inverse mapping (id -> token), used when converting ids back to text.
        self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])

        self.do_word_tokenize = do_word_tokenize
        if do_word_tokenize:
            if word_tokenizer_type == "basic":
                self.word_tokenizer = BasicTokenizer(
                    do_lower_case=do_lower_case, never_split=never_split, tokenize_chinese_chars=False
                )
            elif word_tokenizer_type == "mecab":
                self.word_tokenizer = MecabTokenizer(
                    do_lower_case=do_lower_case, never_split=never_split, **(mecab_kwargs or {})
                )
            else:
                raise ValueError("Invalid word_tokenizer_type '{}' is specified.".format(word_tokenizer_type))

        self.do_subword_tokenize = do_subword_tokenize
        if do_subword_tokenize:
            if subword_tokenizer_type == "wordpiece":
                self.subword_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
            elif subword_tokenizer_type == "character":
                self.subword_tokenizer = CharacterTokenizer(vocab=self.vocab, unk_token=self.unk_token)
            else:
                raise ValueError("Invalid subword_tokenizer_type '{}' is specified.".format(subword_tokenizer_type))

    def _tokenize(self, text):
        # Word-level segmentation first; special tokens are protected from splitting.
        if self.do_word_tokenize:
            tokens = self.word_tokenizer.tokenize(text, never_split=self.all_special_tokens)
        else:
            tokens = [text]

        # Then split each word into subwords (WordPiece or single characters).
        if self.do_subword_tokenize:
            split_tokens = [sub_token for token in tokens for sub_token in self.subword_tokenizer.tokenize(token)]
        else:
            split_tokens = tokens

        return split_tokens


class MecabTokenizer:
    """Runs basic tokenization with MeCab morphological parser."""

    def __init__(self, do_lower_case=False, never_split=None, normalize_text=True, mecab_option: Optional[str] = None):
        """Constructs a MecabTokenizer.

        Args:
            **do_lower_case**: (`optional`) boolean (default False)
                Whether to lower case the input.
            **never_split**: (`optional`) list of str
                Kept for backward compatibility purposes.
                Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`)
                List of tokens not to split.
            **normalize_text**: (`optional`) boolean (default True)
                Whether to apply unicode normalization to text before tokenization.
            **mecab_option**: (`optional`) string passed to `MeCab.Tagger` constructor (default "")
        """
        self.do_lower_case = do_lower_case
        self.never_split = never_split if never_split is not None else []
        self.normalize_text = normalize_text

        # Deferred import so MeCab is only required when this tokenizer is used.
        import MeCab

        self.mecab = MeCab.Tagger(mecab_option) if mecab_option is not None else MeCab.Tagger()

    def tokenize(self, text, never_split=None, **kwargs):
        """Tokenizes a piece of text."""
        if self.normalize_text:
            text = unicodedata.normalize("NFKC", text)

        never_split = self.never_split + (never_split if never_split is not None else [])
        tokens = []

        mecab_output = self.mecab.parse(text)

        cursor = 0
        for line in mecab_output.split("\n"):
            if line == "EOS":
                break

            # Each output line is "surface<TAB>features"; keep only the surface form.
            token, _ = line.split("\t")
            # Track the token's position in the original text so repeated tokens
            # advance the cursor correctly.
            token_start = text.index(token, cursor)
            token_end = token_start + len(token)
            if self.do_lower_case and token not in never_split:
                token = token.lower()

            tokens.append(token)
            cursor = token_end

        return tokens
zMecabTokenizer.tokenize)FNTN)N)r=   r>   r?   r@   r   strr$   r7   r   r   r   r   r/      s   r/   c                   @   s"   e Zd ZdZdddZdd ZdS )	r2   zRuns Character tokenziation.Tc                 C   s   || _ || _|| _dS )ag  Constructs a CharacterTokenizer.

        Args:
            **vocab**:
                Vocabulary object.
            **unk_token**: str
                A special symbol for out-of-vocabulary token.
            **normalize_text**: (`optional`) boolean (default True)
                Whether to apply unicode normalization to text before tokenization.
        N)r"   r   rG   )r3   r"   r   rG   r   r   r   r$      s    zCharacterTokenizer.__init__c                 C   sN   | j rtd|}g }t|D ]*\}}|| jkr>|| j q|| q|S )ae  Tokenizes a piece of text into characters.

        For example:
            input = "apple"
            output = ["a", "p", "p", "l", "e"]
        Args:
            text: A single token or whitespace separated tokens.
                This should have already been passed through `BasicTokenizer`.
        Returns:
            A list of characters.
        """
        if self.normalize_text:
            text = unicodedata.normalize("NFKC", text)

        output_tokens = []
        for char in text:
            # Characters missing from the vocabulary map to the unknown token.
            if char not in self.vocab:
                output_tokens.append(self.unk_token)
                continue

            output_tokens.append(char)

        return output_tokens