U
    &c                     @   s2   d Z ddlZddlmZ G dd dejjjZdS )z5 A TF 2.0 Adaptive Softmax for Transformer XL model.
    N   )
shape_listc                       sN   e Zd Zd fdd	Z fddZeddd	Zed
d ZdddZ  Z	S )TFAdaptiveSoftmaxMaskr   Fc                    s   t  jf | || _|| _|| _||g | _dg| j | _|| _| jd | _t	| jd | _
| j| j
 | _|| _g | _g | _d S Nr   r   )super__init__
vocab_sized_embedd_projcutoffscutoff_endsdiv_valZshortlist_sizelen
n_clustersZ	head_size
keep_order
out_layers	out_projs)selfr   r	   r
   r   r   r   kwargs	__class__ Q/tmp/pip-unpacked-wheel-ymerj3tt/transformers/modeling_tf_transfo_xl_utilities.pyr      s    zTFAdaptiveSoftmaxMask.__init__c                    s  | j dkr>| j| j | jfdddd| _| j| j fdddd| _| jdkrtt| jD ]}| j	| jkr| j| j| j	fddd
|d}| j| n| jd  | j| j| jfddd	
|d}| j| jfddd

|d}| j||f qVntt| jD ]}| j| | j|d   }}| j| j|  }| j|| j	fddd
|d}| j| | j|| |fddd	
|d}| j|| fddd

|d}| j||f qt | d S )Nr   zerosTcluster_weight)shapeZinitializerZ	trainablenamecluster_biasr   zout_projs_._{}zout_layers_._{}_._weightzout_layers_._{}_._bias)r   Z
add_weightr	   r   r   r   ranger   r   r
   formatr   appendr   r   r   r   build)r   Zinput_shapeiZweightZbiasl_idxr_idxZd_emb_ir   r   r   r!   -   sz    

      


   
zTFAdaptiveSoftmaxMask.buildNc                 C   s,   | }|d k	rt d||}t d||| S )Nzibd,ed->ibezibd,nd->ibn)tfZeinsum)xWbZprojyr   r   r   _logitg   s    zTFAdaptiveSoftmaxMask._logitc                 C   s2   t | }t|d }t||gd}t| |S r   )r   r%   r   stackZ	gather_nd)ZlogprobtargetZlp_sizeridxr   r   r   _gather_logprobn   s    z%TFAdaptiveSoftmaxMask._gather_logprobTc              
   C   s  |\}}d}| j dkrl| || jd d | jd d | jd }|d k	rXtjj||d}tjj|dd}	nFt|}
g }	tj	|
d d tj
d}tt| jD ]}| j| | j|d   }}|d k	r||k||k @ }t|}t||| }| jdkr*| jd d || }| jd d || }n| j| d }| j| d }|dkrt|| jgd}t|| jgd}| |||| jd }tj|}|	|dd | jd f  |d k	rvt||}| ||}n| |||| j| }tj|}| jd | d }|d|d f | }|	| |d k	rvt||}t||}| ||}||d d | jd | d f 7 }|d k	r|t|| tjt|tjd7 }qtj|	dd}	|d k	r|rt|}| | | j|| j|rd	nd
d |	S )Nr   r   )labelsZlogits)Zaxis   )Zdtype.Zmean )r   Zaggregation)r   r*   r   r   r%   nnZ(sparse_softmax_cross_entropy_with_logitsZlog_softmaxr   r   Zfloat32r   r   r   r   whereZboolean_maskr   concatr   r   r    r/   Z
scatter_ndcastZint64Zreduce_meanZadd_lossZ
add_metricr   )r   inputsZreturn_meanZtraininghiddenr,   Zhead_logproboutputZlossoutZhidden_sizesr"   r#   r$   maskZmask_idx
cur_targetZcur_WZcur_bZ
head_logitZcur_head_logprobZcur_logprobZ
tail_logitZtail_logprobZcluster_prob_idxZ	logprob_iZcur_tail_logprobr   r   r   callu   sb    
*




"&


zTFAdaptiveSoftmaxMask.call)r   F)N)TF)
__name__
__module____qualname__r   r!   staticmethodr*   r/   r>   __classcell__r   r   r   r   r      s   :
r   )	__doc__Z
tensorflowr%   Zmodeling_tf_utilsr   ZkerasZlayersZLayerr   r   r   r   r   <module>   s   