""" PyTorch Transformer XL model.
    Adapted from https://github.com/kimiyoung/transformer-xl.
    In particular https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/mem_transformer.py
"""

import logging

import torch
import torch.nn as nn
import torch.nn.functional as F

from .configuration_transfo_xl import TransfoXLConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .modeling_transfo_xl_utilities import ProjectedAdaptiveLogSoftmax
from .modeling_utils import PreTrainedModel


logger = logging.getLogger(__name__)

TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP = {
    "transfo-xl-wt103": "https://cdn.huggingface.co/transfo-xl-wt103-pytorch_model.bin",
}


def build_tf_to_pytorch_map(model, config):
    """ A map of modules from TF to PyTorch.
        This time I use a map to keep the PyTorch model as identical to the original PyTorch model as possible.
    """
    tf_to_pt_map = {}

    if hasattr(model, "transformer"):
        # We are loading a TransfoXLLMHeadModel => we will also load the adaptive softmax
        tf_to_pt_map.update(
            {
                "transformer/adaptive_softmax/cutoff_0/cluster_W": model.crit.cluster_weight,
                "transformer/adaptive_softmax/cutoff_0/cluster_b": model.crit.cluster_bias,
            }
        )
        for i, (out_l, proj_l, tie_proj) in enumerate(
            zip(model.crit.out_layers, model.crit.out_projs, config.tie_projs)
        ):
            layer_str = "transformer/adaptive_softmax/cutoff_%d/" % i
            if config.tie_weight:
                tf_to_pt_map.update({layer_str + "b": out_l.bias})
            else:
                # Untied output lookup tables are not implemented in the TF code
                raise NotImplementedError
            if not tie_proj:
                tf_to_pt_map.update({layer_str + "proj": proj_l})
        # Now load the rest of the transformer
        model = model.transformer

    # Embeddings
    for i, (embed_l, proj_l) in enumerate(zip(model.word_emb.emb_layers, model.word_emb.emb_projs)):
        layer_str = "transformer/adaptive_embed/cutoff_%d/" % i
        tf_to_pt_map.update({layer_str + "lookup_table": embed_l.weight, layer_str + "proj_W": proj_l})

    # Transformer blocks
    for i, b in enumerate(model.layers):
        layer_str = "transformer/layer_%d/" % i
        tf_to_pt_map.update(
            {
                layer_str + "rel_attn/LayerNorm/gamma": b.dec_attn.layer_norm.weight,
                layer_str + "rel_attn/LayerNorm/beta": b.dec_attn.layer_norm.bias,
                layer_str + "rel_attn/o/kernel": b.dec_attn.o_net.weight,
                layer_str + "rel_attn/qkv/kernel": b.dec_attn.qkv_net.weight,
                layer_str + "rel_attn/r/kernel": b.dec_attn.r_net.weight,
                layer_str + "ff/LayerNorm/gamma": b.pos_ff.layer_norm.weight,
                layer_str + "ff/LayerNorm/beta": b.pos_ff.layer_norm.bias,
                layer_str + "ff/layer_1/kernel": b.pos_ff.CoreNet[0].weight,
                layer_str + "ff/layer_1/bias": b.pos_ff.CoreNet[0].bias,
                layer_str + "ff/layer_2/kernel": b.pos_ff.CoreNet[3].weight,
                layer_str + "ff/layer_2/bias": b.pos_ff.CoreNet[3].bias,
            }
        )

    # Relative positioning biases
    if config.untie_r:
        r_r_list = []
        r_w_list = []
        for b in model.layers:
            r_r_list.append(b.dec_attn.r_r_bias)
            r_w_list.append(b.dec_attn.r_w_bias)
    else:
        r_r_list = [model.r_r_bias]
        r_w_list = [model.r_w_bias]
    tf_to_pt_map.update({"transformer/r_r_bias": r_r_list, "transformer/r_w_bias": r_w_list})

    return tf_to_pt_map

r0   c                 C   sL  zddl }ddl}W n  tk
r4   t d¡ ‚ Y nX t| |ƒ}|j |¡}i }|D ]0\}}	t d 	||	¡¡ |j 
||¡}
|
||< qT| ¡ D ]œ\}}||ks¤t‚|| }
d|ks¼d|krÆ| |
¡}
d|ksØd|krt|ƒd	krt|ƒ|
jd ksüt‚t|ƒD ]ˆ\}}|
|d
f }z|j|jks,t‚W n< tk
rj } z| j|j|jf7  _‚ W 5 d}~X Y nX t d 	||¡¡ t |¡|_qnpz|j|
jks¤t‚W n< tk
râ } z| j|j|
jf7  _‚ W 5 d}~X Y nX t d 	|¡¡ t |
¡|_| |d¡ | |d d¡ | |d d¡ qŽt d 	d | ¡ ¡¡¡ | S )z, Load tf checkpoints in a pytorch model
    r   NzšLoading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see https://www.tensorflow.org/install/ for installation instructions.z"Loading TF weight {} with shape {}Úkernelr
   r'   r(   r   .z)Initialize PyTorch weight {} for layer {}zInitialize PyTorch weight {}z/Adamz/Adam_1z'Weights not copied to PyTorch model: {}z, )ZnumpyZ
tensorflowÚImportErrorÚloggerÚerrorr0   ZtrainZlist_variablesÚinfoÚformatZload_variableÚitemsÚAssertionErrorÚ	transposeÚlenÚshaper   ÚargsÚtorchZ
from_numpyÚdataÚpopÚjoinÚkeys)r)   r*   Ztf_pathÚnpÚtfr+   Z	init_varsZ
tf_weightsÚnamer;   ÚarrayÚpointerr,   Zp_iZarr_iÚer.   r.   r/   Úload_tf_weights_in_transfo_xll   sV    ÿ


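
# A minimal conversion sketch for the loader above (illustrative only; the checkpoint
# path is an assumption, and TransfoXLLMHeadModel is defined later in this file):
#
#     config = TransfoXLConfig.from_pretrained("transfo-xl-wt103")
#     model = TransfoXLLMHeadModel(config)
#     model = load_tf_weights_in_transfo_xl(model, config, "/path/to/transfo_xl_ckpt")
#     torch.save(model.state_dict(), "pytorch_model.bin")
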
 rH   c                       s&   e Zd Z‡ fdd„Zddd„Z‡  ZS )ÚPositionalEmbeddingc                    s:   t ƒ  ¡  || _ddt d|d¡|   }|  d|¡ d S )Nr   i'  ç        g       @Úinv_freq)ÚsuperÚ__init__Údembr=   ÚarangeZregister_buffer)ÚselfrN   rK   ©Ú	__class__r.   r/   rM   §   s    
zPositionalEmbedding.__init__Nc                 C   sj   t  || j¡}t j| ¡ | ¡ gdd}|d k	rP|d d …d d d …f  d|d¡S |d d …d d d …f S d S )Néÿÿÿÿ©Údim)r=   ZgerrK   ÚcatÚsinÚcosÚexpand)rP   Úpos_seqÚbszZsinusoid_inpÚpos_embr.   r.   r/   Úforward¯   s
     zPositionalEmbedding.forward)N©Ú__name__Ú

class PositionwiseFF(nn.Module):
    def __init__(self, d_model, d_inner, dropout, pre_lnorm=False, layer_norm_epsilon=1e-5):
        super().__init__()

        self.d_model = d_model
        self.d_inner = d_inner
        self.dropout = dropout

        self.CoreNet = nn.Sequential(
            nn.Linear(d_model, d_inner),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
            nn.Linear(d_inner, d_model),
            nn.Dropout(dropout),
        )

        self.layer_norm = nn.LayerNorm(d_model, eps=layer_norm_epsilon)

        self.pre_lnorm = pre_lnorm

    def forward(self, inp):
        if self.pre_lnorm:
            # layer normalization + positionwise feed-forward
            core_out = self.CoreNet(self.layer_norm(inp))

            # residual connection
            output = core_out + inp
        else:
            # positionwise feed-forward
            core_out = self.CoreNet(inp)

            # residual connection + layer normalization
            output = self.layer_norm(inp + core_out)

        return output

„Z‡  ZS )Ú RelPartialLearnableMultiHeadAttnr   NFrd   c                    s  t ƒ  ¡  || _|| _|| _|| _|| _tj|d| | dd| _	t 
|¡| _t 
|¡| _tj|| |dd| _tj||d| _d|d  | _|	| _|
d ks¤|d krÖt t | j| j¡¡| _t t | j| j¡¡| _n|
| _|| _tj| j| j| j dd| _d S )Nr   F)r   re   r   ç      à?)rL   rM   Úoutput_attentionsÚn_headrf   Úd_headrh   ri   rj   r!   rk   ÚdropÚdropattr    rl   r   Úscalerm   Ú	Parameterr=   ÚFloatTensorr'   r(   r"   )rP   rv   rf   rw   rh   ry   Útgt_lenÚext_lenÚmem_lenrm   r'   r(   ru   rn   rQ   r.   r/   rM   ß   s&    
z)RelPartialLearnableMultiHeadAttn.__init__c                 C   sŒ   |  d¡df|  ¡ dd …  }tj||j|jd}tj||gdd}|  d¡d |  d¡f|  ¡ dd …  }|j|Ž }|dd …  |¡}|S )Nr   r   é   ©ÚdeviceÚdtyperT   )Úsizer=   Úzerosr‚   rƒ   rV   ÚviewZview_as)rP   ÚxZzero_pad_shapeZzero_padZx_paddedZx_padded_shaper.   r.   r/   Ú
_rel_shift  s    (
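    # Worked example of the shift above for a (qlen=2, klen=3) score matrix
    # (values illustrative; the stray zero is masked out by the causal mask):
    #
    #     x = [[a0, a1, a2],        _rel_shift(x) = [[a1, a2,  0],
    #          [b0, b1, b2]]                         [b0, b1, b2]]
    #
    # Row i is shifted left by (qlen - 1 - i), which aligns the relative-position
    # scores (the BD term below) so that column j refers to key position j for
    # every query row.
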
    def forward(self, w, r, attn_mask=None, mems=None, head_mask=None):
        qlen, rlen, bsz = w.size(0), r.size(0), w.size(1)

        if mems is not None:
            cat = torch.cat([mems, w], 0)
            if self.pre_lnorm:
                w_heads = self.qkv_net(self.layer_norm(cat))
            else:
                w_heads = self.qkv_net(cat)
            r_head_k = self.r_net(r)

            w_head_q, w_head_k, w_head_v = torch.chunk(w_heads, 3, dim=-1)
            w_head_q = w_head_q[-qlen:]
        else:
            if self.pre_lnorm:
                w_heads = self.qkv_net(self.layer_norm(w))
            else:
                w_heads = self.qkv_net(w)
            r_head_k = self.r_net(r)

            w_head_q, w_head_k, w_head_v = torch.chunk(w_heads, 3, dim=-1)

        klen = w_head_k.size(0)

        w_head_q = w_head_q.view(qlen, bsz, self.n_head, self.d_head)  # qlen x bsz x n_head x d_head
        w_head_k = w_head_k.view(klen, bsz, self.n_head, self.d_head)  # klen x bsz x n_head x d_head
        w_head_v = w_head_v.view(klen, bsz, self.n_head, self.d_head)  # klen x bsz x n_head x d_head

        r_head_k = r_head_k.view(rlen, self.n_head, self.d_head)  # rlen x n_head x d_head

        # compute attention score
        rw_head_q = w_head_q + self.r_w_bias  # qlen x bsz x n_head x d_head
        AC = torch.einsum("ibnd,jbnd->ijbn", (rw_head_q, w_head_k))  # qlen x klen x bsz x n_head

        rr_head_q = w_head_q + self.r_r_bias
        BD = torch.einsum("ibnd,jnd->ijbn", (rr_head_q, r_head_k))  # qlen x rlen x bsz x n_head
        BD = self._rel_shift(BD)

        # [qlen x klen x bsz x n_head]
        attn_score = AC + BD
        attn_score.mul_(self.scale)

        # compute attention probability
        if attn_mask is not None and torch.sum(attn_mask).item():
            attn_mask = attn_mask == 1  # Switch to bool
            if attn_mask.dim() == 2:
                if next(self.parameters()).dtype == torch.float16:
                    attn_score = attn_score.float().masked_fill(attn_mask[None, :, :, None], -65000).type_as(attn_score)
                else:
                    attn_score = attn_score.float().masked_fill(attn_mask[None, :, :, None], -1e30).type_as(attn_score)
            elif attn_mask.dim() == 3:
                if next(self.parameters()).dtype == torch.float16:
                    attn_score = attn_score.float().masked_fill(attn_mask[:, :, :, None], -65000).type_as(attn_score)
                else:
                    attn_score = attn_score.float().masked_fill(attn_mask[:, :, :, None], -1e30).type_as(attn_score)

        # [qlen x klen x bsz x n_head]
        attn_prob = F.softmax(attn_score, dim=1)
        attn_prob = self.dropatt(attn_prob)

        # Mask heads if we want to
        if head_mask is not None:
            attn_prob = attn_prob * head_mask

        # compute attention vector
        attn_vec = torch.einsum("ijbn,jbnd->ibnd", (attn_prob, w_head_v))

        # [qlen x bsz x n_head x d_head]
        attn_vec = attn_vec.contiguous().view(attn_vec.size(0), attn_vec.size(1), self.n_head * self.d_head)

        # linear projection
        attn_out = self.o_net(attn_vec)
        attn_out = self.drop(attn_out)

        if self.pre_lnorm:
            # residual connection
            outputs = [w + attn_out]
        else:
            # residual connection + layer normalization
            outputs = [self.layer_norm(w + attn_out)]

        if self.output_attentions:
            outputs.append(attn_prob)

        return outputs


class RelPartialLearnableDecoderLayer(nn.Module):
    def __init__(self, n_head, d_model, d_head, d_inner, dropout, layer_norm_epsilon=1e-5, **kwargs):
        super().__init__()

        self.dec_attn = RelPartialLearnableMultiHeadAttn(
            n_head, d_model, d_head, dropout, layer_norm_epsilon=layer_norm_epsilon, **kwargs
        )
        self.pos_ff = PositionwiseFF(
            d_model, d_inner, dropout, pre_lnorm=kwargs.get("pre_lnorm"), layer_norm_epsilon=layer_norm_epsilon
        )

    def forward(self, dec_inp, r, dec_attn_mask=None, mems=None, head_mask=None):
        attn_outputs = self.dec_attn(dec_inp, r, attn_mask=dec_attn_mask, mems=mems, head_mask=head_mask)
        ff_output = self.pos_ff(attn_outputs[0])

        outputs = [ff_output] + attn_outputs[1:]

        return outputs

rš   c                       s&   e Zd Zd‡ fdd„	Zdd„ Z‡  ZS )ÚAdaptiveEmbeddingr   Fc              	      s  t ƒ  ¡  || _|| _||g | _|| _|| _|d | _dg| j | _t	 
¡ | _t	 ¡ | _|dkr¢| j t	j|||dkd¡ ||kr | j t	 t ||¡¡¡ nltt| jƒƒD ]\}| j| | j|d   }}	|||  }
| j t	 |	| |
¡¡ | j t	 t ||
¡¡¡ q°d S )Nrt   r   r   )Úsparse)rL   rM   Ún_tokenÚd_embedÚcutoffsÚdiv_valÚd_projÚ	emb_scaleÚcutoff_endsri   Ú
ModuleListr   ZParameterListr   r&   Ú	Embeddingr{   r=   r|   Úranger:   )rP   r    r¡   r¤   r¢   r£   Úsample_softmaxr,   Úl_idxÚr_idxZd_emb_irQ   r.   r/   rM   ‡  s&    



zAdaptiveEmbedding.__init__c                 C   s.  | j dkr8| jd |ƒ}| j| jkr6t || jd ¡}næt|  ¡ ƒ}| 	d¡}t
j| d¡| jg|j|jd}tt| jƒƒD ]†}| j| | j|d   }}||k||k @ }	|	 ¡  ¡ }
|
 ¡ dkrÄq|| d|
¡| }| j| |ƒ}t || j| ¡}| d|
|¡ q|| ¡ | jf }| 	|¡}| | j¡ |S )Nr   r   rS   ©rƒ   r‚   )r£   r   r¤   r¡   r   Zlinearr   r   rŽ   r†   r=   r…   r„   rƒ   r‚   r©   r:   r¢   r¦   ZnonzeroZsqueezeZnumelZindex_selectZindex_copy_rŠ   r¥   )rP   rp   ÚembedÚparamZinp_flatZemb_flatr,   r«   r¬   Zmask_iZ	indices_iZinp_iZemb_iZembed_shaper.   r.   r/   r]   ¢  s*    

 
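
# Sizing sketch for the class above (these numbers match the transfo-xl-wt103 defaults,
# but any consistent values work): with div_val=4 and d_embed=1024 the four clusters get
# embedding widths d_embed / 4**i = 1024, 256, 64 and 16, each projected back to d_proj.
#
#     emb = AdaptiveEmbedding(n_token=267735, d_embed=1024, d_proj=1024,
#                             cutoffs=[20000, 40000, 200000], div_val=4)
#     out = emb(torch.randint(0, 267735, (2, 5)))   # (2, 5, 1024)
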
zAdaptiveEmbedding.forward)r   Fr^   r.   r.   rQ   r/   rž   †  s   rž   c                   @   s8   e Zd ZdZeZeZeZ	dZ
dd„ Zdd„ Zdd„ Zd	S )
ÚTransfoXLPreTrainedModelz† An abstract class to handle weights initialization and
        a simple interface for downloading and loading pretrained models.
    r   c                 C   sL   | j jdkr(tj || j j | j j¡ n | j jdkrHtj |d| j j¡ d S )NÚuniformÚnormalrJ   )r*   Úinitri   Zuniform_Z
init_rangeÚnormal_Úinit_std)rP   r   r.   r.   r/   Ú_init_weightÌ  s    z%TransfoXLPreTrainedModel._init_weightc                 C   s   t j |d¡ d S )NrJ   )ri   r³   Z	constant_)rP   r   r.   r.   r/   Ú
_init_biasÒ  s    z#TransfoXLPreTrainedModel._init_biasc                 C   s<  |j j}| d¡dkrZt|dƒr6|jdk	r6|  |j¡ t|dƒrV|jdk	rV|  |j¡ nÞ| d¡dkr²t|dƒr®tt	|j
ƒƒD ],}|j
| dk	r€tj |j
| d| jj¡ q€n†| d	¡dkrÚt|dƒrÖ|  |j¡ n^| d
¡dkr€t|dƒr|jdk	r|  |j¡ t|dƒr2|jdk	r2|  |j¡ t|dƒr8tt	|jƒƒD ]0}|j| dk	rLtj |j| d| jj¡ qLn¸| d¡dkrØt|dƒr²tj |jd| jj¡ t|dƒr8|jdk	r8|  |j¡ n`t|dƒrð|  |j¡ t|dƒr|  |j¡ t|dƒr |  |j¡ t|dƒr8|  |j¡ dS )z! Initialize the weights.
        rj   rS   r   Nr   rž   r   rJ   r¨   r   r   r   r   rl   g      ð?Úr_embr(   r'   Úr_bias)rR   r_   Úfindr   r   r¶   r   r·   r©   r:   r   ri   r³   r´   r*   Zproj_init_stdr   r   r   rµ   r¸   r(   r'   r¹   )rP   ÚmÚ	classnamer,   r.   r.   r/   Ú_init_weightsÕ  sH    
 
 z&TransfoXLPreTrainedModel._init_weightsN)r_   r`   ra   Ú__doc__r   Zconfig_classÚ'TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAPZpretrained_model_archive_maprH   Zload_tf_weightsZbase_model_prefixr¶   r·   r½   r.   r.   r.   r/   r°   Â  s   r°   as  

    This model is a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`_ sub-class.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general
    usage and behavior.

    Parameters:
        config (:class:`~transformers.TransfoXLConfig`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the configuration.
            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
"""

TRANSFO_XL_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using :class:`transformers.TransfoXLTokenizer`.
            See :func:`transformers.PreTrainedTokenizer.encode` and
            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.

            `What are input IDs? <../glossary.html#input-ids>`__
        mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layer`):
            Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
            (see `mems` output below). Can be used to speed up sequential decoding. The token ids which have their mems
            given to this model should not be passed as input ids as they have already been computed.
        head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
            Mask to nullify selected heads of the self-attention modules.
            Mask values selected in ``[0, 1]``:
            :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
        inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
            than the model's internal embedding lookup matrix.
"""


@add_start_docstrings(
    "The bare Transformer-XL Model transformer outputting raw hidden-states without any specific head on top.",
    TRANSFO_XL_START_DOCSTRING,
)
class TransfoXLModel(TransfoXLPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states

        self.n_token = config.vocab_size

        self.d_embed = config.d_embed
        self.d_model = config.d_model
        self.n_head = config.n_head
        self.d_head = config.d_head

        self.word_emb = AdaptiveEmbedding(
            config.vocab_size, config.d_embed, config.d_model, config.cutoffs, div_val=config.div_val
        )

        self.drop = nn.Dropout(config.dropout)

        self.n_layer = config.n_layer

        self.tgt_len = config.tgt_len
        self.mem_len = config.mem_len
        self.ext_len = config.ext_len
        self.max_klen = config.tgt_len + config.ext_len + config.mem_len

        self.attn_type = config.attn_type

        if not config.untie_r:
            self.r_w_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head))
            self.r_r_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head))

        self.layers = nn.ModuleList()
        if config.attn_type == 0:  # the default attention
            for i in range(config.n_layer):
                self.layers.append(
                    RelPartialLearnableDecoderLayer(
                        config.n_head,
                        config.d_model,
                        config.d_head,
                        config.d_inner,
                        config.dropout,
                        tgt_len=config.tgt_len,
                        ext_len=config.ext_len,
                        mem_len=config.mem_len,
                        dropatt=config.dropatt,
                        pre_lnorm=config.pre_lnorm,
                        r_w_bias=None if config.untie_r else self.r_w_bias,
                        r_r_bias=None if config.untie_r else self.r_r_bias,
                        output_attentions=self.output_attentions,
                        layer_norm_epsilon=config.layer_norm_epsilon,
                    )
                )
        else:  # learnable embeddings and absolute embeddings are not used in our pretrained checkpoints
            raise NotImplementedError

        self.same_length = config.same_length
        self.clamp_len = config.clamp_len

        if self.attn_type == 0:  # default attention
            self.pos_emb = PositionalEmbedding(self.d_model)
        else:  # learnable embeddings and absolute embeddings are not used in our pretrained checkpoints
            raise NotImplementedError

        self.init_weights()

zTransfoXLModel.__init__c                 C   s   | j S ro   ©r   ©rP   r.   r.   r/   Úget_input_embeddingsj  s    z#TransfoXLModel.get_input_embeddingsc                 C   s
   || _ d S ro   rÉ   )rP   Znew_embeddingsr.   r.   r/   Úset_input_embeddingsm  s    z#TransfoXLModel.set_input_embeddingsc                 C   s
   d| _ d S )NrS   )rª   rÊ   r.   r.   r/   Úbackward_compatiblep  s    z"TransfoXLModel.backward_compatiblec                 C   s   || _ || _|| _d S ro   )r}   r   r~   ©rP   r}   r~   r   r.   r.   r/   Úreset_lengths  s    zTransfoXLModel.reset_lengthc                 C   s   t  d¡ d S )Nz8Head pruning is not implemented for Transformer-XL model)r3   r5   )rP   Zheadsr.   r.   r/   Ú_prune_headsx  s    
zTransfoXLModel._prune_headsc                 C   s^   | j dkrVg }t|  ¡ ƒ}t| jƒD ],}tj| j || jj|j	|j
d}| |¡ q$|S d S d S )Nr   r­   )r   r   rŽ   r©   rÄ   r=   r…   r*   rf   rƒ   r‚   r&   )rP   r[   r•   r¯   r,   Úemptyr.   r.   r/   Ú	init_mems|  s    
zTransfoXLModel.init_memsc           
   	   C   sª   |d krd S t |ƒt |ƒks$tdƒ‚t ¡ t g }|td|d | j ƒ }td|| j ƒ}tt |ƒƒD ]4}tj|| || gdd}	| 	|	||…  
¡ ¡ qfW 5 Q R X |S )Nzlen(hids) != len(mems)r   rT   )r:   r8   r=   Zno_gradÚmaxr~   r   r©   rV   r&   Údetach)
rP   Úhidsr•   Úmlenr—   Únew_memsZend_idxZbeg_idxr,   rV   r.   r.   r/   Ú_update_memsˆ  s    
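    # Index arithmetic sketch for the update above (hypothetical sizes): with mem_len=4,
    # ext_len=0, a cache of mlen=4 and a new segment of qlen=3, the concatenated history
    # has 7 steps; end_idx = 4 + 3 = 7 and beg_idx = 7 - 4 = 3, so the kept window
    # cat[3:7] holds exactly the most recent mem_len hidden states.
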
"zTransfoXLModel._update_memsNc                 C   sV  |dk	r|dk	rt dƒ‚n^|dk	r@| dd¡ ¡ }| ¡ \}}n8|dk	rp| dd¡ ¡ }|jd |jd  }}nt dƒ‚|dkrŠ|  |¡}|dk	r| ¡ dkrÒ| d¡ d¡ d¡ d¡}| | j	dddd¡}n"| ¡ dkrô| d¡ d¡ d¡}|j
t|  ¡ ƒjd}ndg| j	 }|dk	r(|}n
|  |¡}|dk	rJ|d  d¡nd}|| }	| jrÆ|j||	ftjd}
|	| j }|dkr|| }n|}t |
d| ¡t |
| ¡ dd…dd…df }n4tj|j||	ftjdd| d	dd…dd…df }g }g }| jdkrÎtj|	d dd
|j|jd}| jdkrD|j| jd |  |¡}|  |¡}|  |¡}t| jƒD ]^\}}| |¡ |dkrŒdn|| }||||||| d}|d }| j rl| |d ¡ qlnt!‚|  |¡}|  "||||¡}| dd¡ ¡ |g}| j#r.| |¡ t$dd„ |D ƒƒ}| |¡ | j rRt$dd„ |D ƒƒ}| |¡ |S )aë  
    Return:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.TransfoXLConfig`) and inputs:
        last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the last layer of the model.
        mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layer`):
            Contains pre-computed hidden-states (key and values in the attention blocks).
            Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model
            should not be passed as input ids as they have already been computed.
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        from transformers import TransfoXLTokenizer, TransfoXLModel
        import torch

        tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
        model = TransfoXLModel.from_pretrained('transfo-xl-wt103')
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids)
        last_hidden_states, mems = outputs[:2]

        """
        # The original Transformer-XL code uses shapes [len, bsz]; we want a unified
        # interface in the library with [bsz, len], so we transpose here.
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input_ids = input_ids.transpose(0, 1).contiguous()
            qlen, bsz = input_ids.size()
        elif inputs_embeds is not None:
            inputs_embeds = inputs_embeds.transpose(0, 1).contiguous()
            qlen, bsz = inputs_embeds.shape[0], inputs_embeds.shape[1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if mems is None:
            mems = self.init_mems(bsz)

        # Prepare head mask if needed: 1.0 in head_mask indicates we keep the head.
        # Input head_mask has shape [num_heads] or [num_layers x num_heads];
        # it is converted to shape [num_layers x qlen x klen x bsz x n_head].
        if head_mask is not None:
            if head_mask.dim() == 1:
                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(0).unsqueeze(0)
                head_mask = head_mask.expand(self.n_layer, -1, -1, -1, -1)
            elif head_mask.dim() == 2:
                head_mask = head_mask.unsqueeze(1).unsqueeze(1).unsqueeze(1)
            head_mask = head_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
        else:
            head_mask = [None] * self.n_layer

        if inputs_embeds is not None:
            word_emb = inputs_embeds
        else:
            word_emb = self.word_emb(input_ids)

        mlen = mems[0].size(0) if mems is not None else 0
        klen = mlen + qlen
        if self.same_length:
            all_ones = word_emb.new_ones((qlen, klen), dtype=torch.uint8)
            mask_len = klen - self.mem_len
            if mask_len > 0:
                mask_shift_len = qlen - mask_len
            else:
                mask_shift_len = qlen
            dec_attn_mask = (torch.triu(all_ones, 1 + mlen) + torch.tril(all_ones, -mask_shift_len))[:, :, None]
        else:
            dec_attn_mask = torch.triu(word_emb.new_ones((qlen, klen), dtype=torch.uint8), diagonal=1 + mlen)[
                :, :, None
            ]

        hids = []
        attentions = []
        if self.attn_type == 0:  # default
            pos_seq = torch.arange(klen - 1, -1, -1.0, device=word_emb.device, dtype=word_emb.dtype)
            if self.clamp_len > 0:
                pos_seq.clamp_(max=self.clamp_len)
            pos_emb = self.pos_emb(pos_seq)

            core_out = self.drop(word_emb)
            pos_emb = self.drop(pos_emb)

            for i, layer in enumerate(self.layers):
                hids.append(core_out)
                mems_i = None if mems is None else mems[i]
                layer_outputs = layer(
                    core_out, pos_emb, dec_attn_mask=dec_attn_mask, mems=mems_i, head_mask=head_mask[i]
                )
                core_out = layer_outputs[0]
                if self.output_attentions:
                    attentions.append(layer_outputs[1])
        else:  # learnable embeddings and absolute embeddings
            raise NotImplementedError

        core_out = self.drop(core_out)

        new_mems = self._update_mems(hids, mems, mlen, qlen)

        # We transpose back to the library standard shape [bsz, len, hidden_dim]
        outputs = [core_out.transpose(0, 1).contiguous(), new_mems]
        if self.output_hidden_states:
            # Add the last layer and transpose to [bsz, len, hidden_dim]
            hids.append(core_out)
            hids = list(t.transpose(0, 1).contiguous() for t in hids)
            outputs.append(hids)
        if self.output_attentions:
            # Transpose to the library standard shape [bsz, n_heads, query_seq_len, key_seq_len]
            attentions = list(t.permute(2, 3, 0, 1).contiguous() for t in attentions)
            outputs.append(attentions)

        return outputs  # last hidden state, new_mems, (all hidden states), (all attentions)

zTransfoXLModel.forward)NNNN)r_   r`   ra   rM   rË   rÌ   rÍ   rÏ   rÐ   rÒ   rØ   r   ÚTRANSFO_XL_INPUTS_DOCSTRINGr]   rb   r.   r.   rQ   r/   rÀ   $  s   ArÀ   z‡The Transformer-XL Model with a language modeling head on top
    (adaptive softmax with weights tied to the adaptive input embeddings)c                       sV   e Zd Z‡ fdd„Zdd„ Zdd„ Zdd„ Zeeƒdd
d„ƒZ	dd„ Z
dd„ Z‡  ZS )ÚTransfoXLLMHeadModelc                    sZ   t ƒ  |¡ t|ƒ| _|j| _| jdks0tdƒ‚t|j|j|j	|j
|jd| _|  ¡  d S )Nr   z†Sampling from the softmax is not implemented yet. Please look at issue: #3310: https://github.com/huggingface/transformers/issues/3310rÁ   )rL   rM   rÀ   r   rª   r8   r   rÃ   r¡   rf   r¢   r£   r   rÈ   )rP   r*   rQ   r.   r/   rM   (  s    
ÿþ    ÿzTransfoXLLMHeadModel.__init__c                 C   s  | j jr<tt| jjƒƒD ]"}|  | jj| | jjj	| ¡ q| j j
rt| j j
ƒD ]º\}}|r¼| j jdkr¼| j j| j jkr¼| j jr¤t | jjjd  ¡ ¡| jj|< n| jjjd | jj|< qR|rR| j jdkrR| j jröt | jjj|  ¡ ¡| jj|< qR| jjj| | jj|< qRdS )zZ
        Run this to be sure output and input (adaptive) softmax weights are tied
        r   r   N)r*   r   r©   r:   r   r   Z_tie_or_clone_weightsr   r   r   r   r   r£   rf   r¡   Ztorchscriptri   r{   r   Úcloner   )rP   r,   r-   r.   r.   r/   Útie_weights7  s     
 ""z TransfoXLLMHeadModel.tie_weightsc                 C   s   | j  |||¡ d S ro   )r   rÏ   rÎ   r.   r.   r/   rÏ   L  s    z!TransfoXLLMHeadModel.reset_lengthc                 C   s   | j  |¡S ro   )r   rÒ   )rP   r[   r.   r.   r/   rÒ   O  s    zTransfoXLLMHeadModel.init_memsNc                 C   sÒ   |dk	r |  d¡|  d¡ }}n(|dk	r@|  d¡|  d¡ }}ntdƒ‚| j||||d}|d }	|	dd…| d…f }
|dd… }|  |
|¡}|dkr²| ||d¡}|g| }n| ||d ¡}|dg| }|S )a¿
  
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Labels for language modeling.
            Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
            Indices are selected in ``[-100, 0, ..., config.vocab_size]``
            All labels set to ``-100`` are ignored (masked), the loss is only
            computed for labels in ``[0, ..., config.vocab_size]``

    Return:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.TransfoXLConfig`) and inputs:
        loss (:obj:`torch.FloatTensor` of shape `(batch_size, sequence_length-1)`, `optional`, returned when ``labels`` is provided)
            Language modeling loss.
        prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layer`):
            Contains pre-computed hidden-states (key and values in the attention blocks).
            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
            should not be passed as input ids as they have already been computed.
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        from transformers import TransfoXLTokenizer, TransfoXLLMHeadModel
        import torch

        tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
        model = TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids)
        prediction_scores, mems = outputs[:2]

        """
        if input_ids is not None:
            bsz, tgt_len = input_ids.size(0), input_ids.size(1)
        elif inputs_embeds is not None:
            bsz, tgt_len = inputs_embeds.size(0), inputs_embeds.size(1)
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        transformer_outputs = self.transformer(input_ids, mems=mems, head_mask=head_mask, inputs_embeds=inputs_embeds)

        last_hidden = transformer_outputs[0]
        pred_hid = last_hidden[:, -tgt_len:]
        outputs = transformer_outputs[1:]

        softmax_output = self.crit(pred_hid.view(-1, pred_hid.size(-1)), labels)
        if labels is None:
            softmax_output = softmax_output.view(bsz, tgt_len, -1)
            outputs = [softmax_output] + outputs
        else:
            softmax_output = softmax_output.view(bsz, tgt_len - 1)
            outputs = [softmax_output, None] + outputs

        return outputs  # (loss), prediction_scores (None if labels given), new_mems, (all hidden states), (all attentions)

    def get_output_embeddings(self):
        """ Double-check if you are using adaptive softmax.
        """
        if self.sample_softmax > 0:
            return self.out_layer
        else:
            return self.crit.out_layers[-1]

    def prepare_inputs_for_generation(self, input_ids, past, **model_kwargs):
        inputs = {"input_ids": input_ids}

        # if past is defined in model kwargs then use it for faster decoding
        if past:
            inputs["mems"] = past

        return inputs