"""Tokenization classes for Flaubert, based on XLM."""


import logging
import unicodedata

import six

from .tokenization_xlm import XLMTokenizer


logger = logging.getLogger(__name__)

VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.json",
    "merges_file": "merges.txt",
}

PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "flaubert-small-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_small_cased/vocab.json",
        "flaubert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_base_uncased/vocab.json",
        "flaubert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_base_cased/vocab.json",
        "flaubert-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_large_cased/vocab.json",
    },
    "merges_file": {
        "flaubert-small-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_small_cased/merges.txt",
        "flaubert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_base_uncased/merges.txt",
        "flaubert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_base_cased/merges.txt",
        "flaubert-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_large_cased/merges.txt",
    },
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "flaubert-small-cased": 512,
    "flaubert-base-uncased": 512,
    "flaubert-base-cased": 512,
    "flaubert-large-cased": 512,
}

PRETRAINED_INIT_CONFIGURATION = {
    "flaubert-small-cased": {"do_lowercase": False},
    "flaubert-base-uncased": {"do_lowercase": True},
    "flaubert-base-cased": {"do_lowercase": False},
    "flaubert-large-cased": {"do_lowercase": False},
}


def convert_to_unicode(text):
    """
    Converts `text` to Unicode (if it's not already), assuming UTF-8 input.
    """

    def six_ensure_text(s, encoding="utf-8", errors="strict"):
        if isinstance(s, six.binary_type):
            return s.decode(encoding, errors)
        elif isinstance(s, six.text_type):
            return s
        else:
            raise TypeError("not expecting type '%s'" % type(s))

    return six_ensure_text(text, encoding="utf-8", errors="ignore")
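# Illustration (not part of the original module): `convert_to_unicode` accepts
# either `bytes` or `str` and always returns `str`. Because the final call uses
# errors="ignore", byte sequences that are not valid UTF-8 are silently dropped.
# Example values shown for clarity only:
#
#     convert_to_unicode(b"caf\xc3\xa9")  # -> "café"
#     convert_to_unicode("café")          # -> "café" (returned unchanged)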
class FlaubertTokenizer(XLMTokenizer):
    """
    BPE tokenizer for Flaubert.

    - Moses preprocessing and tokenization.
    - Normalizes all input text.
    - The argument ``special_tokens`` and the function ``set_special_tokens`` can be used to add additional symbols
      (e.g. "__classify__") to a vocabulary.
    - ``do_lowercase`` controls lowercasing (automatically set for pretrained vocabularies).

    This tokenizer inherits from :class:`~transformers.XLMTokenizer`. Please check the superclass for usage examples
    and documentation regarding arguments.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

    def __init__(self, do_lowercase=False, **kwargs):
        super().__init__(**kwargs)
        self.do_lowercase = do_lowercase
        self.do_lowercase_and_remove_accent = False

    def preprocess_text(self, text):
        # Normalize quote styles and Unicode form before Moses/BPE.
        text = text.replace("``", '"').replace("''", '"')
        text = convert_to_unicode(text)
        text = unicodedata.normalize("NFC", text)

        if self.do_lowercase:
            text = text.lower()

        return text

    def _tokenize(self, text, bypass_tokenizer=False):
        """
        Tokenize a string given language code using Moses.

        Details of tokenization:
        - [sacremoses](https://github.com/alvations/sacremoses): port of Moses
            - Install with `pip install sacremoses`

        Args:
            - bypass_tokenizer (bool): Allow users to preprocess and tokenize the sentences externally
              (default = False). If True, we only apply BPE.

        Returns:
            List of tokens.
        """
        lang = "fr"  # Flaubert is French-only; the Moses pipeline always uses the French rules.
        if lang and self.lang2id and lang not in self.lang2id:
            logger.error(
                "Supplied language code not found in lang2id mapping. Please check that your language is supported by "
                "the loaded pretrained model."
            )

        if bypass_tokenizer:
            text = text.split()
        else:
            text = self.preprocess_text(text)
            text = self.moses_pipeline(text, lang=lang)
            text = self.moses_tokenize(text, lang=lang)

        split_tokens = []
        for token in text:
            if token:
                split_tokens.extend([t for t in self.bpe(token).split(" ")])

        return split_tokens
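

if __name__ == "__main__":
    # Usage sketch (not part of the original module): load the tokenizer by one
    # of the shortcut names declared in PRETRAINED_VOCAB_FILES_MAP and tokenize
    # a French sentence. Assumes the transformers v2-era `from_pretrained` API
    # and network access to the vocab/merges files listed above.
    tokenizer = FlaubertTokenizer.from_pretrained("flaubert-base-cased")

    sample = "Bonjour, comment allez-vous ?"
    # `tokenize` runs preprocess_text -> Moses pipeline -> BPE (see `_tokenize`).
    print(tokenizer.tokenize(sample))
    # `encode` additionally maps tokens to vocabulary ids and adds the model's
    # special tokens around the sequence.
    print(tokenizer.encode(sample))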
