import json
import re
import warnings
from typing import Dict, List, Optional, Union

import sentencepiece

from .file_utils import S3_BUCKET_PREFIX
from .tokenization_utils import BatchEncoding, PreTrainedTokenizer


vocab_files_names = {
    "source_spm": "source.spm",
    "target_spm": "target.spm",
    "vocab": "vocab.json",
    "tokenizer_config_file": "tokenizer_config.json",
}

MODEL_NAMES = ("opus-mt-en-de",)
PRETRAINED_VOCAB_FILES_MAP = {
    k: {m: f"{S3_BUCKET_PREFIX}/Helsinki-NLP/{m}/{fname}" for m in MODEL_NAMES}
    for k, fname in vocab_files_names.items()
}


class MarianTokenizer(PreTrainedTokenizer):
    """Sentencepiece tokenizer for Marian. Source and target languages have different SPM models.
    The logic is to use the relevant source_spm or target_spm to encode text as pieces, then look up
    each piece in a vocab dictionary.

    Examples::

        from transformers import MarianTokenizer
        tok = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-de')
        src_texts = ["I am a small frog.", "Tom asked his teacher for advice."]
        tgt_texts = ["Ich bin ein kleiner Frosch.", "Tom bat seinen Lehrer um Rat."]  # optional
        batch_enc: BatchEncoding = tok.prepare_translation_batch(src_texts, tgt_texts=tgt_texts)
        # keys: [input_ids, attention_mask, decoder_input_ids, decoder_attention_mask]
        # model(**batch_enc) should work
    """

    vocab_files_names = vocab_files_names
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = {m: 512 for m in MODEL_NAMES}
    model_input_names = ["attention_mask"]
    language_code_re = re.compile(">>.+<<")  # target-language prefixes like ">>fr<<"

    def __init__(
        self,
        vocab=None,
        source_spm=None,
        target_spm=None,
        source_lang=None,
        target_lang=None,
        unk_token="<unk>",
        eos_token="</s>",
        pad_token="<pad>",
        max_len=512,
        **kwargs,
    ):
        super().__init__(max_len=max_len, eos_token=eos_token, unk_token=unk_token, pad_token=pad_token, **kwargs)
        self.encoder = load_json(vocab)
        if self.unk_token not in self.encoder:
            raise KeyError("<unk> token must be in vocab")
        assert self.pad_token in self.encoder
        self.decoder = {v: k for k, v in self.encoder.items()}

        self.source_lang = source_lang
        self.target_lang = target_lang

        # Load the source and target SentencePiece models; encoding uses whichever
        # model current_spm points at (the source model by default).
        self.spm_source = sentencepiece.SentencePieceProcessor()
        self.spm_source.Load(source_spm)
        self.spm_target = sentencepiece.SentencePieceProcessor()
        self.spm_target.Load(target_spm)
        self.current_spm = self.spm_source

        # Multilingual checkpoints keep target-language codes such as ">>fr<<" in the vocab.
        self.supported_language_codes: list = [k for k in self.encoder if k.startswith(">>") and k.endswith("<<")]

        try:
            from mosestokenizer import MosesPunctuationNormalizer

            self.punc_normalizer = MosesPunctuationNormalizer(source_lang)
        except ImportError:
            warnings.warn("Recommended: pip install mosestokenizer")
            self.punc_normalizer = lambda x: x

    def normalize(self, x: str) -> str:
        """Cover moses empty string edge case. They return empty list for '' input!"""
        return self.punc_normalizer(x) if x else ""

    def _convert_token_to_id(self, token):
        return self.encoder.get(token, self.encoder[self.unk_token])

    def remove_language_code(self, text: str):
        """Remove language codes like >>fr<< before sentencepiece."""
        match = self.language_code_re.match(text)
        code: list = [match.group(0)] if match else []
        return code, self.language_code_re.sub("", text)

    def _tokenize(self, text: str) -> List[str]:
        code, text = self.remove_language_code(text)
        pieces = self.current_spm.EncodeAsPieces(text)
        return code + pieces

    def _convert_id_to_token(self, index: int) -> str:
        """Converts an index (integer) to a token (str) using the decoder."""
        return self.decoder.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """Uses the target language sentencepiece model."""
        return self.spm_target.DecodePieces(tokens)

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]:
        """Build model inputs from a sequence by appending eos_token_id."""
        if token_ids_1 is None:
            return token_ids_0 + [self.eos_token_id]
        # We don't expect to process pairs, but keep the pair logic for API consistency.
        return token_ids_0 + token_ids_1 + [self.eos_token_id]

    def batch_decode(self, token_ids, **kwargs) -> List[str]:
        return [self.decode(ids, **kwargs) for ids in token_ids]

    def prepare_translation_batch(
        self,
        src_texts: List[str],
        tgt_texts: Optional[List[str]] = None,
        max_length: Optional[int] = None,
        pad_to_max_length: bool = True,
        return_tensors: str = "pt",
    ) -> BatchEncoding:
        """Prepare model inputs for translation. For best performance, translate one sentence at a time.

        Arguments:
            src_texts: list of src language texts
            tgt_texts: list of tgt language texts
            max_length: (None) defer to config (1024 for mbart-large-en-ro)
            pad_to_max_length: (bool)
            return_tensors: (str) default "pt" returns pytorch tensors, pass None to return lists

        Returns:
            BatchEncoding: with keys [input_ids, attention_mask, decoder_input_ids, decoder_attention_mask],
            all shaped (bs, seq_len). (BatchEncoding is a dict of string -> tensor or lists.)
            If no tgt_texts are specified, the only keys will be input_ids and attention_mask.
        """
        if "" in src_texts:
            raise ValueError(f"found empty string in src_texts: {src_texts}")
        self.current_spm = self.spm_source
        src_texts = [self.normalize(t) for t in src_texts]
        model_inputs: BatchEncoding = self.batch_encode_plus(
            src_texts,
            add_special_tokens=True,
            return_tensors=return_tensors,
            max_length=max_length,
            pad_to_max_length=pad_to_max_length,
        )
        if tgt_texts is None:
            return model_inputs

        # Encode targets with the target-side SPM, then namespace the keys as decoder_*.
        self.current_spm = self.spm_target
        decoder_inputs: BatchEncoding = self.batch_encode_plus(
            tgt_texts,
            add_special_tokens=True,
            return_tensors=return_tensors,
            max_length=max_length,
            pad_to_max_length=pad_to_max_length,
        )
        for k, v in decoder_inputs.items():
            model_inputs["decoder_" + k] = v
        self.current_spm = self.spm_source
        return model_inputs

    @property
    def vocab_size(self) -> int:
        return len(self.encoder)


def load_json(path: str) -> Union[Dict, List]:
    with open(path, "r") as f:
        return json.load(f)
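
# ---------------------------------------------------------------------------
# Usage sketch, not part of the original module. It exercises the multilingual
# language-code path that the class docstring example does not cover:
# _tokenize keeps a leading ">>code<<" as its own token and strips it before
# sentencepiece runs. Run it as `python -m transformers.tokenization_marian`
# so the relative imports resolve; the checkpoint name is an assumption,
# substitute any multilingual Marian checkpoint available to you.
if __name__ == "__main__":
    tok = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-roa")  # hypothetical choice
    print(tok.supported_language_codes)  # e.g. ['>>fra<<', '>>spa<<', ...]
    batch = tok.prepare_translation_batch(src_texts=[">>fra<< I am a small frog."])
    print(batch["input_ids"].shape)  # (1, seq_len); a torch tensor with the default return_tensors="pt"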