""" Utilities for PyTorch Transformer XL model.
    Directly adapted from https://github.com/kimiyoung/transformer-xl.
"""

import torch
import torch.nn as nn
import torch.nn.functional as F
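
# Added commentary (not from the original file): this module implements an
# adaptive softmax over a projected hidden state.  The vocabulary is split
# into a frequent-token "head" (shortlist) plus tail clusters at `cutoffs`.
# The head softmax scores the shortlist tokens and one extra logit per tail
# cluster, so for a token w that lives in tail cluster c,
#
#     log p(w | h) = log p(c | h) + log p(w | c, h)
#
# where p(c | h) comes from column cutoffs[0] + c of the head softmax and
# p(w | c, h) from that cluster's smaller softmax.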


class ProjectedAdaptiveLogSoftmax(nn.Module):
    def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, keep_order=False):
        super().__init__()

        self.n_token = n_token
        self.d_embed = d_embed
        self.d_proj = d_proj

        self.cutoffs = cutoffs + [n_token]
        self.cutoff_ends = [0] + self.cutoffs
        self.div_val = div_val

        self.shortlist_size = self.cutoffs[0]
        self.n_clusters = len(self.cutoffs) - 1
        # The head predicts shortlist tokens plus one logit per tail cluster.
        self.head_size = self.shortlist_size + self.n_clusters

        if self.n_clusters > 0:
            self.cluster_weight = nn.Parameter(torch.zeros(self.n_clusters, self.d_embed))
            self.cluster_bias = nn.Parameter(torch.zeros(self.n_clusters))

        self.out_layers = nn.ModuleList()
        self.out_projs = nn.ParameterList()

        if div_val == 1:
            # Single output layer over the full vocabulary; projections are
            # only needed when the projection width differs from the embedding width.
            for i in range(len(self.cutoffs)):
                if d_proj != d_embed:
                    self.out_projs.append(nn.Parameter(torch.FloatTensor(d_proj, d_embed)))
                else:
                    self.out_projs.append(None)

            self.out_layers.append(nn.Linear(d_embed, n_token))
        else:
            # One output layer per cluster, with embedding width shrinking by
            # div_val for each successive (rarer) cluster.
            for i in range(len(self.cutoffs)):
                l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
                d_emb_i = d_embed // (div_val ** i)

                self.out_projs.append(nn.Parameter(torch.FloatTensor(d_proj, d_emb_i)))
                self.out_layers.append(nn.Linear(d_emb_i, r_idx - l_idx))

        self.keep_order = keep_order

    def _compute_logit(self, hidden, weight, bias, proj):
        if proj is None:
            logit = F.linear(hidden, weight, bias=bias)
        else:
            proj_hid = F.linear(hidden, proj.t().contiguous())
            logit = F.linear(proj_hid, weight, bias=bias)

        return logit
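
    # Note on _compute_logit (added commentary, not from the original file):
    # with a projection matrix `proj` of shape [d_proj, d_emb_i], the two
    # F.linear calls above compute  logit = (hidden @ proj) @ weight.T + bias,
    # i.e. hidden states are first mapped down to the cluster's embedding
    # width before the output layer is applied.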

    def forward(self, hidden, labels=None, keep_order=False):
        """
            Params:
                hidden :: [len*bsz x d_proj]
                labels :: [len*bsz]
            Return:
                if labels is None:
                    out :: [len*bsz x n_token] log probabilities of tokens over the vocabulary
                else:
                    out :: [(len-1)*bsz] negative log-likelihood (labels are shifted one
                    step against hidden, so each position is predicted from the previous one)
            We could replace this implementation by the native PyTorch one
            if theirs had an option to set bias on all clusters.
            See: https://github.com/pytorch/pytorch/blob/dbe6a7a9ff1a364a8706bf5df58a1ca96d2fd9da/torch/nn/modules/adaptive.py#L138
        """

        if labels is not None:
            # Shift so that tokens < n predict n
            hidden = hidden[..., :-1, :].contiguous()
            labels = labels[..., 1:].contiguous()
            hidden = hidden.view(-1, hidden.size(-1))
            labels = labels.view(-1)
            if hidden.size(0) != labels.size(0):
                raise RuntimeError("Input and labels should have the same size in the batch dimension.")
        else:
            hidden = hidden.view(-1, hidden.size(-1))

        if self.n_clusters == 0:
            logit = self._compute_logit(hidden, self.out_layers[0].weight, self.out_layers[0].bias, self.out_projs[0])
            if labels is not None:
                out = -F.log_softmax(logit, dim=-1).gather(1, labels.unsqueeze(1)).squeeze(1)
            else:
                out = F.log_softmax(logit, dim=-1)
        else:
            # construct weights and biases for the head and each tail cluster
            weights, biases = [], []
            for i in range(len(self.cutoffs)):
                if self.div_val == 1:
                    l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
                    weight_i = self.out_layers[0].weight[l_idx:r_idx]
                    bias_i = self.out_layers[0].bias[l_idx:r_idx]
                else:
                    weight_i = self.out_layers[i].weight
                    bias_i = self.out_layers[i].bias

                if i == 0:
                    # append one cluster logit per tail to the head softmax
                    weight_i = torch.cat([weight_i, self.cluster_weight], dim=0)
                    bias_i = torch.cat([bias_i, self.cluster_bias], dim=0)

                weights.append(weight_i)
                biases.append(bias_i)

            head_weight, head_bias, head_proj = weights[0], biases[0], self.out_projs[0]

            head_logit = self._compute_logit(hidden, head_weight, head_bias, head_proj)
            head_logprob = F.log_softmax(head_logit, dim=1)

            if labels is None:
                out = hidden.new_empty((head_logit.size(0), self.n_token))
            else:
                out = torch.zeros_like(labels, dtype=hidden.dtype, device=hidden.device)

            offset = 0
            cutoff_values = [0] + self.cutoffs
            for i in range(len(cutoff_values) - 1):
                l_idx, r_idx = cutoff_values[i], cutoff_values[i + 1]

                if labels is not None:
                    # gather the positions whose target falls in this cluster
                    mask_i = (labels >= l_idx) & (labels < r_idx)
                    indices_i = mask_i.nonzero().squeeze()

                    if indices_i.numel() == 0:
                        continue

                    target_i = labels.index_select(0, indices_i) - l_idx
                    head_logprob_i = head_logprob.index_select(0, indices_i)
                    hidden_i = hidden.index_select(0, indices_i)
                else:
                    hidden_i = hidden

                if i == 0:
                    if labels is not None:
                        logprob_i = head_logprob_i.gather(1, target_i[:, None]).squeeze(1)
                    else:
                        out[:, : self.cutoffs[0]] = head_logprob[:, : self.cutoffs[0]]
                else:
                    weight_i, bias_i, proj_i = weights[i], biases[i], self.out_projs[i]

                    tail_logit_i = self._compute_logit(hidden_i, weight_i, bias_i, proj_i)
                    tail_logprob_i = F.log_softmax(tail_logit_i, dim=1)
                    # index of cluster i's logit in the head softmax
                    cluster_prob_idx = self.cutoffs[0] + i - 1
                    if labels is not None:
                        logprob_i = head_logprob_i[:, cluster_prob_idx] + tail_logprob_i.gather(1, target_i[:, None]).squeeze(1)
                    else:
                        logprob_i = head_logprob[:, cluster_prob_idx, None] + tail_logprob_i
                        out[:, l_idx:r_idx] = logprob_i

                if labels is not None:
                    if (hasattr(self, "keep_order") and self.keep_order) or keep_order:
                        out.index_copy_(0, indices_i, -logprob_i)
                    else:
                        out[offset : offset + logprob_i.size(0)].copy_(-logprob_i)
                    offset += logprob_i.size(0)

        return out

    def log_prob(self, hidden):
        r""" Computes log probabilities for all :math:`n\_classes`
        From: https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/adaptive.py
        Args:
            hidden (Tensor): a minibatch of examples
        Returns:
            log-probabilities for each class :math:`c`
            in range :math:`0 <= c <= n\_classes`, where :math:`n\_classes` is a
            parameter passed to the ``AdaptiveLogSoftmaxWithLoss`` constructor.
        Shape:
            - Input: :math:`(N, in\_features)`
            - Output: :math:`(N, n\_classes)`
        """
        if self.n_clusters == 0:
            logit = self._compute_logit(hidden, self.out_layers[0].weight, self.out_layers[0].bias, self.out_projs[0])
            return F.log_softmax(logit, dim=-1)
        else:
            # construct weights and biases, as in forward()
            weights, biases = [], []
            for i in range(len(self.cutoffs)):
                if self.div_val == 1:
                    l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
                    weight_i = self.out_layers[0].weight[l_idx:r_idx]
                    bias_i = self.out_layers[0].bias[l_idx:r_idx]
                else:
                    weight_i = self.out_layers[i].weight
                    bias_i = self.out_layers[i].bias

                if i == 0:
                    weight_i = torch.cat([weight_i, self.cluster_weight], dim=0)
                    bias_i = torch.cat([bias_i, self.cluster_bias], dim=0)

                weights.append(weight_i)
                biases.append(bias_i)

            head_weight, head_bias, head_proj = weights[0], biases[0], self.out_projs[0]
            head_logit = self._compute_logit(hidden, head_weight, head_bias, head_proj)

            out = hidden.new_empty((head_logit.size(0), self.n_token))
            head_logprob = F.log_softmax(head_logit, dim=1)

            cutoff_values = [0] + self.cutoffs
            for i in range(len(cutoff_values) - 1):
                start_idx, stop_idx = cutoff_values[i], cutoff_values[i + 1]

                if i == 0:
                    out[:, : self.cutoffs[0]] = head_logprob[:, : self.cutoffs[0]]
                else:
                    weight_i, bias_i, proj_i = weights[i], biases[i], self.out_projs[i]

                    tail_logit_i = self._compute_logit(hidden, weight_i, bias_i, proj_i)
                    tail_logprob_i = F.log_softmax(tail_logit_i, dim=1)

                    # Cluster i's log-probability sits at column cutoffs[0] + i - 1
                    # of the head softmax, matching `cluster_prob_idx` in forward().
                    logprob_i = head_logprob[:, self.cutoffs[0] + i - 1, None] + tail_logprob_i
                    out[:, start_idx:stop_idx] = logprob_i

            return out
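

# ---------------------------------------------------------------------------
# Minimal usage sketch (added for illustration, not part of the original
# module).  All sizes are arbitrary example values; assumes a PyTorch version
# whose nn.ParameterList accepts None entries (used when d_proj == d_embed).
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    n_token, d_embed, d_proj = 1000, 64, 64
    cutoffs = [200, 600]  # head covers ids [0, 200); tails [200, 600) and [600, 1000)
    crit = ProjectedAdaptiveLogSoftmax(n_token, d_embed, d_proj, cutoffs, div_val=1)

    bsz, seq_len = 4, 8
    hidden = torch.randn(bsz, seq_len, d_proj)  # batch-first transformer outputs
    # Deterministic targets that hit the head and both tail clusters.
    labels = (torch.arange(bsz * seq_len) * 37 % n_token).view(bsz, seq_len)

    # With labels: per-position negative log-likelihood, flattened to
    # [bsz * (seq_len - 1)] because forward() shifts labels one step.
    nll = crit(hidden, labels)
    print(nll.shape)  # torch.Size([28])

    # Without labels: log-probabilities over the whole vocabulary.
    log_probs = crit(hidden)
    print(log_probs.shape)  # torch.Size([32, 1000])

    # log_prob() expects pre-flattened hidden states.
    table = crit.log_prob(hidden.view(-1, d_proj))
    print(table.shape)  # torch.Size([32, 1000])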