# coding=utf-8
""" Tokenization classes for Transformer XL model.
    Adapted from https://github.com/kimiyoung/transformer-xl.
"""


import glob
import logging
import os
import pickle
import re
from collections import Counter, OrderedDict
from typing import Optional

import numpy as np

from tokenizers import Tokenizer
from tokenizers.implementations import BaseTokenizer
from tokenizers.models import WordLevel
from tokenizers.normalizers import Lowercase, Sequence, Strip, unicode_normalizer_from_str
from tokenizers.pre_tokenizers import CharDelimiterSplit, WhitespaceSplit
from tokenizers.processors import BertProcessing

from .file_utils import cached_path, is_torch_available
from .tokenization_utils import PreTrainedTokenizer, PreTrainedTokenizerFast


if is_torch_available():
    import torch


logger = logging.getLogger(__name__)

VOCAB_FILES_NAMES = {"pretrained_vocab_file": "vocab.bin", "vocab_file": "vocab.txt"}
VOCAB_FILES_NAMES_FAST = {"pretrained_vocab_file": "vocab.json", "vocab_file": "vocab.json"}

PRETRAINED_VOCAB_FILES_MAP = {
    "pretrained_vocab_file": {
        "transfo-xl-wt103": "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-vocab.bin",
    }
}

PRETRAINED_VOCAB_FILES_MAP_FAST = {
    "pretrained_vocab_file": {
        "transfo-xl-wt103": "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-vocab.json",
    }
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "transfo-xl-wt103": None,
}

PRETRAINED_CORPUS_ARCHIVE_MAP = {
    "transfo-xl-wt103": "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-corpus.bin",
}
CORPUS_NAME = "corpus.bin"

class TransfoXLTokenizer(PreTrainedTokenizer):
    """
    Transformer-XL tokenizer adapted from Vocab class in https://github.com/kimiyoung/transformer-xl

    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
    should refer to the superclass for more information regarding methods.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = []

    def __init__(
        self,
        special=None,
        min_freq=0,
        max_size=None,
        lower_case=False,
        delimiter=None,
        vocab_file=None,
        pretrained_vocab_file=None,
        never_split=None,
        unk_token="<unk>",
        eos_token="<eos>",
        additional_special_tokens=["<formula>"],
        **kwargs
    ):
        super().__init__(
            unk_token=unk_token, eos_token=eos_token, additional_special_tokens=additional_special_tokens, **kwargs
        )

        if never_split is None:
            never_split = self.all_special_tokens
        if special is None:
            special = []
        self.counter = Counter()
        self.special = special
        self.min_freq = min_freq
        self.max_size = max_size
        self.lower_case = lower_case
        self.delimiter = delimiter
        self.vocab_file = vocab_file
        self.never_split = never_split
        self.punctuation_symbols = r'!"#$%&()*+,-./\:;<=>?@[\]^_`{|}~'
        self.punction_without_space_before_pattern = re.compile(r"[^\s][{}]".format(self.punctuation_symbols))
        self.punctuation_with_space_around_pattern = self._compile_space_around_punctuation_pattern()

        try:
            if pretrained_vocab_file is not None:
                # Load a vocabulary dict that was serialized with torch.save by save_vocabulary()
                vocab_dict = torch.load(pretrained_vocab_file)
                for key, value in vocab_dict.items():
                    if key not in self.__dict__:
                        self.__dict__[key] = value

            if vocab_file is not None:
                self.build_vocab()
        except Exception:
            raise ValueError(
                "Unable to parse file {}. Unknown format. "
                "If you tried to load a model saved through TransfoXLTokenizerFast, "
                "please note they are not compatible.".format(pretrained_vocab_file)
            )

        if vocab_file is not None:
            self.build_vocab()

    def _compile_space_around_punctuation_pattern(self):
        look_ahead_for_special_token = "(?=[{}])".format(self.punctuation_symbols)
        look_ahead_to_match_all_except_space = r"(?=[^\s])"
        return re.compile(r"" + look_ahead_for_special_token + look_ahead_to_match_all_except_space)

    def count_file(self, path, verbose=False, add_eos=False):
        if verbose:
            logger.info("counting file {} ...".format(path))
        assert os.path.exists(path)

        sents = []
        with open(path, "r", encoding="utf-8") as f:
            for idx, line in enumerate(f):
                if verbose and idx > 0 and idx % 500000 == 0:
                    logger.info("    line {}".format(idx))
                symbols = self.tokenize(line, add_eos=add_eos)
                self.counter.update(symbols)
                sents.append(symbols)

        return sents

    def count_sents(self, sents, verbose=False):
        """
            sents : a list of sentences, each a list of tokenized symbols
        """
        if verbose:
            logger.info("counting {} sents ...".format(len(sents)))
        for idx, symbols in enumerate(sents):
            if verbose and idx > 0 and idx % 500000 == 0:
                logger.info("    line {}".format(idx))
            self.counter.update(symbols)

    def _build_from_file(self, vocab_file):
        self.idx2sym = []
        self.sym2idx = OrderedDict()

        with open(vocab_file, "r", encoding="utf-8") as f:
            for line in f:
                symb = line.strip().split()[0]
                self.add_symbol(symb)
        if "<UNK>" in self.sym2idx:
            self.unk_idx = self.sym2idx["<UNK>"]
        elif "<unk>" in self.sym2idx:
            self.unk_idx = self.sym2idx["<unk>"]
        else:
            raise ValueError("No <unk> token in vocabulary")

    def save_vocabulary(self, vocab_path):
        """
        Save the vocabulary and special tokens file to a directory.

        Args:
            vocab_path (:obj:`str`):
                The directory in which to save the vocabulary.

        Returns:
            :obj:`Tuple(str)`: Paths to the files saved.
        """
        logger.warning(
            "Please note you will not be able to load the saved vocabulary in "
            "Rust-based TransfoXLTokenizerFast as they don't share the same structure."
        )
        if os.path.isdir(vocab_path):
            vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["pretrained_vocab_file"])
        else:
            vocab_file = vocab_path
        torch.save(self.__dict__, vocab_file)
        return (vocab_file,)

    def build_vocab(self):
        if self.vocab_file:
            logger.info("building vocab from {}".format(self.vocab_file))
            self._build_from_file(self.vocab_file)
            logger.info("final vocab size {}".format(len(self)))
        else:
            logger.info("building vocab with min_freq={}, max_size={}".format(self.min_freq, self.max_size))
            self.idx2sym = []
            self.sym2idx = OrderedDict()

            for sym in self.special:
                self.add_special(sym)

            for sym, cnt in self.counter.most_common(self.max_size):
                if cnt < self.min_freq:
                    break
                self.add_symbol(sym)

            logger.info("final vocab size {} from {} unique tokens".format(len(self), len(self.counter)))

    def encode_file(self, path, ordered=False, verbose=False, add_eos=True, add_double_eos=False):
        if verbose:
            logger.info("encoding file {} ...".format(path))
        assert os.path.exists(path)
        encoded = []
        with open(path, "r", encoding="utf-8") as f:
            for idx, line in enumerate(f):
                if verbose and idx > 0 and idx % 500000 == 0:
                    logger.info("    line {}".format(idx))
                symbols = self.tokenize(line, add_eos=add_eos, add_double_eos=add_double_eos)
                encoded.append(self.convert_to_tensor(symbols))

        if ordered:
            encoded = torch.cat(encoded)

        return encoded

    def encode_sents(self, sents, ordered=False, verbose=False):
        if verbose:
            logger.info("encoding {} sents ...".format(len(sents)))
        encoded = []
        for idx, symbols in enumerate(sents):
            if verbose and idx > 0 and idx % 500000 == 0:
                logger.info("    line {}".format(idx))
            encoded.append(self.convert_to_tensor(symbols))

        if ordered:
            encoded = torch.cat(encoded)

        return encoded

    def add_special(self, sym):
        if sym not in self.sym2idx:
            self.idx2sym.append(sym)
            self.sym2idx[sym] = len(self.idx2sym) - 1
            # expose e.g. self.eos_idx for the special token "<eos>"
            setattr(self, "{}_idx".format(sym.strip("<>")), self.sym2idx[sym])

    def add_symbol(self, sym):
        if sym not in self.sym2idx:
            self.idx2sym.append(sym)
            self.sym2idx[sym] = len(self.idx2sym) - 1

    def _convert_id_to_token(self, idx):
        """Converts an id in a token (BPE) using the vocab."""
        assert 0 <= idx < len(self), "Index {} out of vocabulary range".format(idx)
        return self.idx2sym[idx]

    def _convert_token_to_id(self, sym):
        """ Converts a token (str) in an id using the vocab. """
        if sym in self.sym2idx:
            return self.sym2idx[sym]
        elif hasattr(self, "unk_idx"):
            return self.sym2idx.get(sym, self.unk_idx)
        # Backward compatibility with pre-trained models
        elif "<unk>" in self.sym2idx:
            return self.sym2idx["<unk>"]
        elif "<UNK>" in self.sym2idx:
            return self.sym2idx["<UNK>"]
        else:
            raise ValueError("Token not in vocabulary and no <unk> token in vocabulary for replacement")

    def convert_tokens_to_string(self, tokens):
        """ Converts a sequence of tokens (string) in a single string. """
        out_string = " ".join(tokens).strip()
        return out_string

    def convert_to_tensor(self, symbols):
        return torch.LongTensor(self.convert_tokens_to_ids(symbols))

    @property
    def vocab_size(self):
        return len(self.idx2sym)

    def get_vocab(self):
        return dict(self.sym2idx, **self.added_tokens_encoder)

    def _tokenize(self, line, add_eos=False, add_double_eos=False):
        line = line.strip()
        # convert to lower case
        if self.lower_case:
            line = line.lower()

        # empty delimiter '' will evaluate False
        if self.delimiter == "":
            symbols = line
        else:
            symbols = line.split(self.delimiter)

        if add_double_eos:  # lm1b
            return ["<S>"] + symbols + ["<S>"]
        elif add_eos:
            return symbols + ["<eos>"]
        else:
            return symbols

    def prepare_for_tokenization(self, text, **kwargs):
        # add spaces before punctuation symbols as should be done in transfo-xl
        if "add_space_before_punct_symbol" in kwargs and kwargs["add_space_before_punct_symbol"]:
            text = self.punctuation_with_space_around_pattern.sub(r" ", text)
        elif self.punction_without_space_before_pattern.search(text):
            # searches until the first occurrence of a punctuation symbol without surrounding space
            logger.warning(
                "You might want to consider setting `add_space_before_punct_symbol=True` as an argument to the "
                "`tokenizer.encode()` to avoid tokenizing words with punctuation symbols to the `<unk>` token"
            )
        return text
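# ---------------------------------------------------------------------------
# Usage sketch for TransfoXLTokenizer (illustrative only; "train.txt" is a
# placeholder path, not a file shipped with the library). The word-level
# vocabulary is built in two passes: count_file() accumulates token
# frequencies into self.counter, build_vocab() freezes the symbol table,
# and encode_file() then maps text to id tensors.
#
#     tokenizer = TransfoXLTokenizer(special=["<eos>"], lower_case=False)
#     tokenizer.count_file("train.txt", add_eos=True)   # pass 1: count tokens
#     tokenizer.build_vocab()                           # pass 2: freeze vocab
#     ids = tokenizer.encode_file("train.txt", ordered=True)  # 1-D LongTensor
# ---------------------------------------------------------------------------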

class _TransfoXLDelimiterLookupTokenizer(BaseTokenizer):
    def __init__(
        self,
        vocab_file,
        delimiter,
        lowercase,
        unk_token,
        eos_token,
        add_eos=False,
        add_double_eos=False,
        normalization: Optional[str] = None,
    ):
        try:
            tokenizer = WordLevel(vocab_file, unk_token=unk_token)
            tokenizer = Tokenizer(tokenizer)
        except Exception:
            raise ValueError(
                "Unable to parse file {}. Unknown format. "
                "If you tried to load a model saved through TransfoXLTokenizer, "
                "please note they are not compatible.".format(vocab_file)
            )

        # Create the correct normalization path
        normalizer = []

        # Include unicode normalization
        if normalization:
            normalizer += [unicode_normalizer_from_str(normalization)]

        # Include case normalization
        if lowercase:
            normalizer += [Lowercase()]

        # Strip normalizer at the end
        normalizer += [Strip(left=True, right=True)]

        if len(normalizer) > 0:
            tokenizer.normalizer = Sequence(normalizer) if len(normalizer) > 1 else normalizer[0]

        # Setup the splitter
        tokenizer.pre_tokenizer = CharDelimiterSplit(delimiter) if delimiter else WhitespaceSplit()

        if add_double_eos:
            tokenizer.post_processor = BertProcessing(
                (eos_token, tokenizer.token_to_id(eos_token)), (eos_token, tokenizer.token_to_id(eos_token))
            )

        parameters = {
            "model": "TransfoXLModel",
            "add_eos": add_eos,
            "add_double_eos": add_double_eos,
            "unk_token": unk_token,
            "eos_token": eos_token,
            "delimiter": delimiter,
            "lowercase": lowercase,
        }

        super().__init__(tokenizer, parameters)


class TransfoXLTokenizerFast(PreTrainedTokenizerFast):
    """
    Construct a "Fast" Transformer-XL tokenizer (backed by HuggingFace's `tokenizers` library).

    The Transformer-XL tokenizer is a word-level tokenizer (no sub-word tokenization).

    Adapted from Vocab class in https://github.com/kimiyoung/transformer-xl

    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the methods. Users
    should refer to the superclass for more information regarding methods.
    """

    vocab_files_names = VOCAB_FILES_NAMES_FAST
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP_FAST
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = []

    def __init__(
        self,
        special=None,
        min_freq=0,
        max_size=None,
        lower_case=False,
        delimiter=None,
        vocab_file=None,
        pretrained_vocab_file=None,
        never_split=None,
        unk_token="<unk>",
        eos_token="<eos>",
        additional_special_tokens=["<formula>"],
        add_eos=False,
        add_double_eos=False,
        normalization=None,
        **kwargs
    ):
        super().__init__(
            _TransfoXLDelimiterLookupTokenizer(
                vocab_file=vocab_file or pretrained_vocab_file,
                delimiter=delimiter,
                lowercase=lower_case,
                unk_token=unk_token,
                eos_token=eos_token,
                add_eos=add_eos,
                add_double_eos=add_double_eos,
                normalization=normalization,
            ),
            unk_token=unk_token,
            eos_token=eos_token,
            additional_special_tokens=additional_special_tokens,
            **kwargs
        )

    def save_pretrained(self, save_directory):
        logger.warning(
            "Please note you will not be able to load the vocabulary in "
            "Python-based TransfoXLTokenizer as they don't share the same structure."
        )
        return super().save_pretrained(save_directory)


class LMOrderedIterator(object):
    def __init__(self, data, bsz, bptt, device="cpu", ext_len=None):
        """
            data -- LongTensor -- the LongTensor is strictly ordered
        """
        self.bsz = bsz
        self.bptt = bptt
        self.ext_len = ext_len if ext_len is not None else 0

        self.device = device

        # Work out how cleanly we can divide the dataset into bsz parts.
        self.n_step = data.size(0) // bsz

        # Trim off any extra elements that wouldn't cleanly fit (remainders).
        data = data.narrow(0, 0, self.n_step * bsz)

        # Evenly divide the data across the bsz batches.
        self.data = data.view(bsz, -1).t().contiguous().to(device)

        # Number of mini-batches
        self.n_batch = (self.n_step + self.bptt - 1) // self.bptt

    def get_batch(self, i, bptt=None):
        if bptt is None:
            bptt = self.bptt
        seq_len = min(bptt, self.data.size(0) - 1 - i)

        end_idx = i + seq_len
        beg_idx = max(0, i - self.ext_len)

        data = self.data[beg_idx:end_idx]
        target = self.data[i + 1 : i + 1 + seq_len]

        data_out = data.transpose(0, 1).contiguous().to(self.device)
        target_out = target.transpose(0, 1).contiguous().to(self.device)

        return data_out, target_out, seq_len

    def get_fixlen_iter(self, start=0):
        for i in range(start, self.data.size(0) - 1, self.bptt):
            yield self.get_batch(i)

    def get_varlen_iter(self, start=0, std=5, min_len=5, max_deviation=3):
        max_len = self.bptt + max_deviation * std
        i = start
        while True:
            # 95% of the time take bptt as-is, 5% of the time take bptt / 2
            bptt = self.bptt if np.random.random() < 0.95 else self.bptt / 2.0
            bptt = min(max_len, max(min_len, int(np.random.normal(bptt, std))))
            data, target, seq_len = self.get_batch(i, bptt)
            i += seq_len
            yield data, target, seq_len
            if i >= self.data.size(0) - 2:
                break

    def __iter__(self):
        return self.get_fixlen_iter()


class LMShuffledIterator(object):
    def __init__(self, data, bsz, bptt, device="cpu", ext_len=None, shuffle=False):
        """
            data -- list[LongTensor] -- there is no order among the LongTensors
        """
        self.data = data

        self.bsz = bsz
        self.bptt = bptt
        self.ext_len = ext_len if ext_len is not None else 0

        self.device = device
        self.shuffle = shuffle

    def get_sent_stream(self):
        # index iterator
        epoch_indices = np.random.permutation(len(self.data)) if self.shuffle else np.array(range(len(self.data)))

        # sentence iterator
        for idx in epoch_indices:
            yield self.data[idx]

    def stream_iterator(self, sent_stream):
        # streams for each data in the batch
        streams = [None] * self.bsz

        data = torch.LongTensor(self.bptt, self.bsz)
        target = torch.LongTensor(self.bptt, self.bsz)

        n_retain = 0

        while True:
            # data   : [n_retain+bptt x bsz]
            # target : [bptt x bsz]
            data[n_retain:].fill_(-1)
            target.fill_(-1)

            valid_batch = True

            for i in range(self.bsz):
                n_filled = 0
                try:
                    while n_filled < self.bptt:
                        if streams[i] is None or len(streams[i]) <= 1:
                            streams[i] = next(sent_stream)
                        # number of new tokens to fill in
                        n_new = min(len(streams[i]) - 1, self.bptt - n_filled)
                        # first n_retain tokens are retained from last batch
                        data[n_retain + n_filled : n_retain + n_filled + n_new, i] = streams[i][:n_new]
                        target[n_filled : n_filled + n_new, i] = streams[i][1 : n_new + 1]
                        streams[i] = streams[i][n_new:]
                        n_filled += n_new
                except StopIteration:
                    valid_batch = False
                    break

            if not valid_batch:
                return

            data_out = data.transpose(0, 1).contiguous().to(self.device)
            target_out = target.transpose(0, 1).contiguous().to(self.device)

            yield data_out, target_out, self.bptt

            n_retain = min(data.size(0), self.ext_len)
            if n_retain > 0:
                data[:n_retain] = data[-n_retain:]
            data.resize_(n_retain + self.bptt, data.size(1))

    def __iter__(self):
        # sent_stream is an iterator
        sent_stream = self.get_sent_stream()

        for batch in self.stream_iterator(sent_stream):
            yield batch


class LMMultiFileIterator(LMShuffledIterator):
    def __init__(self, paths, vocab, bsz, bptt, device="cpu", ext_len=None, shuffle=False):
        self.paths = paths
        self.vocab = vocab

        self.bsz = bsz
        self.bptt = bptt
        self.ext_len = ext_len if ext_len is not None else 0

        self.device = device
        self.shuffle = shuffle

    def get_sent_stream(self, path):
        sents = self.vocab.encode_file(path, add_double_eos=True)
        if self.shuffle:
            np.random.shuffle(sents)
        sent_stream = iter(sents)

        return sent_stream

    def __iter__(self):
        if self.shuffle:
            np.random.shuffle(self.paths)

        for path in self.paths:
            # sent_stream is an iterator
            sent_stream = self.get_sent_stream(path)
            for batch in self.stream_iterator(sent_stream):
                yield batch
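# ---------------------------------------------------------------------------
# Batching sketch (illustrative). LMOrderedIterator slices one long token
# stream into bsz parallel streams and yields (data, target, seq_len) triples
# where target is data shifted one position ahead, each of shape
# [bsz, seq_len] (data gains ext_len extra context columns when ext_len > 0):
#
#     stream = torch.arange(10000)          # stand-in for an encoded corpus
#     train_iter = LMOrderedIterator(stream, bsz=4, bptt=32)
#     for data, target, seq_len in train_iter:
#         pass  # data.shape == target.shape == (4, seq_len)
# ---------------------------------------------------------------------------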
dd„ƒZdd„ Zdd„ Zdd	„ ZdS )ÚTransfoXLCorpusNc                 O   s>  t j|f|ž|Ž}|tkr$t| }ntj |t¡}zt||d}W n6 tk
rx   t	 
d |d t ¡ ¡||¡¡ Y dS X ||kr”t	 d |¡¡ nt	 d ||¡¡ | ||Ž}t |¡}	|	 ¡ D ]\}
}||j|
< qÂ||_|jdk	rútj|jtjd|_|jdk	rtj|jtjd|_|jdk	r:tj|jtjd|_|S )z5
        Instantiate a pre-processed corpus.
        )Ú	cache_dirz€Corpus '{}' was not found in corpus list ({}). We assumed '{}' was a path or url but couldn't find files {} at this path or url.z, Nzloading corpus file {}z'loading corpus file {} from cache at {})Zdtype)r   Úfrom_pretrainedÚPRETRAINED_CORPUS_ARCHIVE_MAPrG   rH   rc   ÚCORPUS_NAMEr   ÚEnvironmentErrorrE   Úerrorr*   ÚkeysrF   r.   r/   r0   r1   rÏ   ÚtrainZtensorÚlongÚvalidÚtest)ÚclsZpretrained_model_name_or_pathrÒ   Úinputsr6   rÏ   Zcorpus_fileZresolved_corpus_fileÚcorpusZcorpus_dictr7   r8   r;   r;   r<   rÓ   |  s>    
úÿ



zTransfoXLCorpus.from_pretrainedc                 O   s(   t ||Ž| _d | _d | _d | _d | _d S rz   )r   rÏ   ÚdatasetrÙ   rÛ   rÜ   )r5   Úargsr6   r;   r;   r<   r   ¨  s
    zTransfoXLCorpus.__init__c                 C   sÂ  || _ | j dkrT| j tj |d¡¡ | j tj |d¡¡ | j tj |d¡¡ nH| j dkrv| j tj |d¡¡ n&| j dkrœtj |ddd	¡}t |¡}| j ¡  | j d
kr| jjtj |d¡dd| _	| jjtj |d¡dd| _
| jjtj |d¡dd| _n¶| j dkrp| jjtj |d¡ddd| _	| jjtj |d¡ddd| _
| jjtj |d¡ddd| _nN| j dkr¾|| _	| jjtj |d¡ddd| _
| jjtj |d¡ddd| _d S )N)ÚptbÚwt2Úenwik8Útext8z	train.txtz	valid.txtztest.txtÚwt103Úlm1bz41-billion-word-language-modeling-benchmark-r13outputz'training-monolingual.tokenized.shuffledz	news.en-*)râ   rã   ræ   T)rm   ©rä   rå   F)rm   rD   )rm   rj   )rà   rÏ   rV   rG   rH   rc   Úglobr2   ro   rÙ   rÛ   rÜ   )r5   rH   rà   Ztrain_path_patternZtrain_pathsr;   r;   r<   Úbuild_corpus¯  s8    


ü

 zTransfoXLCorpus.build_corpusc                 O   s¦   |dkrP| j dkr&t| jf|ž|Ž}q¢| j dkr¢d|d< t| j| jf|ž|Ž}nR|dkr¢|dkrf| jn| j}| j dkrˆt|f|ž|Ž}n| j dkr¢t|f|ž|Ž}|S )NrÙ   )râ   rã   ræ   rä   rå   rç   TrÅ   )rÛ   rÜ   rÛ   )rà   r    rÙ   rÍ   rÏ   rÛ   rÜ   rÄ   )r5   r]   rá   r6   Z	data_iterr¬   r;   r;   r<   Úget_iteratorÑ  s    



zTransfoXLCorpus.get_iterator)N)r†   r‡   rˆ   ÚclassmethodrÓ   r   rê   rë   r;   r;   r;   r<   rÑ   {  s
   +"rÑ   c              	   C   s  t j | d¡}t j | d¡}t j |¡r>t d¡ t |¡}nÒt j |¡rvt d¡ t|dƒ}t	 |¡}W 5 Q R X nšt d 
|¡¡ i }|dkr¦dg|d	< d
|d< nP|dkrÂdg|d	< d|d< n4|dkrîg |d	< d
|d< t j | d¡|d< n|dkröt| |f|Ž}t ||¡ |S )Nzcache.ptz	cache.pklzLoading cached dataset...z%Loading cached dataset from pickle...ÚrbzProducing dataset {}...)ræ   rã   r   r!   Fr$   râ   Trç   z1b_word_vocab.txtr   rè   )rG   rH   rc   rI   rE   rF   r.   r/   rK   Úpickler*   rÑ   re   )Údatadirrà   ÚfnZ	fn_picklerß   Úfpr6   r;   r;   r<   Úget_lm_corpusâ  s2    





rò   )7r‰   ré   ÚloggingrG   rî   r(   Úcollectionsr   r   Útypingr   Znumpyr½   Z
tokenizersr   Ztokenizers.implementationsr   Ztokenizers.modelsr   Ztokenizers.normalizersr   r	   r
   r   Ztokenizers.pre_tokenizersr   r   Ztokenizers.processorsr   Z
file_utilsr   r   Ztokenization_utilsr   r   r.   Ú	getLoggerr†   rE   rd   rž   r‹   rŸ   r   rÔ   rÕ   r   r’   rœ   Úobjectr    rÄ   rÍ   rÑ   rò   r;   r;   r;   r<   Ú<module>   s`   


 ÿÿ ÿÿ ÿ ÿ  =>;M g