U
    &c                     @   s  d Z ddlZddlZddlZddlmZ ddlm	Z	m
Z
 ddlmZmZmZmZmZmZ ddlmZ eeZdd	d
Zdd Zdd ZejjeejjjejjedZG dd dejjj Z!G dd dejjj Z"G dd dejjj Z#G dd dejjj Z$eG dd dejjj Z%G dd deZ&dZ'dZ(e	de'G dd  d e&Z)e	d!e'G d"d# d#e&Z*e	d$e'G d%d& d&e&Z+e	d'e'G d(d) d)e&Z,e	d*e'G d+d, d,e&Z-dS )-z TF 2.0 XLNet model.
    N   )XLNetConfig)add_start_docstrings add_start_docstrings_to_callable)TFPreTrainedModelTFSequenceSummaryTFSharedEmbeddingsget_initializerkeras_serializable
shape_list)BatchEncodingz7https://cdn.huggingface.co/xlnet-base-cased-tf_model.h5z8https://cdn.huggingface.co/xlnet-large-cased-tf_model.h5)zxlnet-base-casedzxlnet-large-casedc                 C   s:   ddt tdtj | dt | d     }| | S )z Implementation of the gelu activation function.
        XLNet is using OpenAI GPT's gelu
        Also see https://arxiv.org/abs/1606.08415
          ?      ?   gHm?   )tftanhnpsqrtpipow)xZcdf r   B/tmp/pip-unpacked-wheel-ymerj3tt/transformers/modeling_tf_xlnet.pygelu.   s    2r   c                 C   s   | t |  S N)r   Zsigmoid)r   r   r   r   swish7   s    r   )r   relur   c                       sX   e Zd Z fddZ fddZdd Zddd	ZdddZdddZdddZ	  Z
S )TFXLNetRelativeAttentionc                    s   t  jf | |j| _|j|j dkr:td|j|jf |j| _|j| _|j| _d|jd  | _|j| _t	j
jj|jdd| _t	j
j|j| _d S )Nr   zLThe hidden size (%d) is not a multiple of the number of attention heads (%d)r   r   
layer_normepsilonname)super__init__output_attentionsd_modeln_head
ValueErrord_headscaleinitializer_ranger   keraslayersLayerNormalizationlayer_norm_epsr   Dropoutdropoutselfconfigkwargs	__class__r   r   r$   C   s    
z!TFXLNetRelativeAttention.__init__c                    s,  t | j}| j| j| j| jf|ddd| _| j| j| j| jf|ddd| _| j| j| j| jf|ddd| _| j| j| j| jf|ddd| _	| j| j| j| jf|ddd| _
| j| j| jfddd	d| _| j| j| jfddd
d| _| j| j| jfdddd| _| jd| j| jf|ddd| _t | d S )NTqshapeinitializerZ	trainabler"   kvorzerosr_r_biasr_s_biasr_w_biasr   	seg_embed)r	   r+   
add_weightr&   r'   r)   r8   r<   r=   r>   r?   rA   rB   rC   rD   r#   buildr3   input_shaper;   r6   r   r   rF   V   sp    
               
   
   
      zTFXLNetRelativeAttention.buildc                 C   s   t d S r   NotImplementedError)r3   Zheadsr   r   r   prune_headsu   s    z$TFXLNetRelativeAttention.prune_headsc                 C   s   t |}t||d |d |d |d f}|dddf }t||d |d d |d |d f}|ddd|ddddf }|S )z<perform relative shift to form the relative attention score.r   r   r   r   N.)r   r   Zreshape)r3   r   klenZx_sizer   r   r   	rel_shiftx   s    $( z"TFXLNetRelativeAttention.rel_shiftFc                 C   s  |\}}}}}}}	t d|| j |}
t d|| j |}| j|t|
d d}|dkr^d}n$t d|| j | j}t d||}|
| | | j }|dk	r|j	t j
kr|d|  }n|d	|  }t jj|dd
}| j||d}|	dk	r||	 }t d||}| jr||fS |S )z.Core relative positional attention operations.zibnd,jbnd->ijbnr   )rM   Nr   zibnd,snd->ibnszijbs,ibns->ijbni  gꌠ9Y>)FZaxistrainingzijbn,jbnd->ibnd)r   einsumrC   rA   rN   r   rB   rD   r*   dtypeZfloat16nnZsoftmaxr1   r%   )r3   inputsrQ   Zq_headk_head_hv_head_hk_head_rseg_mat	attn_mask	head_maskacZbdZefZ
attn_score	attn_probattn_vecr   r   r   rel_attn_core   s*    z&TFXLNetRelativeAttention.rel_attn_coreTc                 C   s@   |\}}t d|| j}| j||d}|r2|| }| |}|S )zPost-attention processing.zibnd,hnd->ibhrP   )r   rR   r>   r1   r   )r3   rU   ZresidualrQ   hr^   Zattn_outoutputr   r   r   post_attention   s    
z'TFXLNetRelativeAttention.post_attentionc              	   C   sT  |\	}}}}}}}	}
}|d k	r|	d k	rLt t|	dkrLtj|	|gdd}n|}td|| j}td|| j}td|| j}td|| j}| j	|||||||g|d}| j
r|\}}| j||g|d}td|| j}|
d k	r0td||
}| j	|||||||g|d}| j
r |\}}td||
}n,| j	|||||||g|d}| j
r\|\}}| j||g|d}| j
r6||f}n|	d k	rt t|	dkrtj|	|gdd}n|}td|| j}td|| j}td|| j}td|| j}| j	|||||||g|d}| j
r |\}}| j||g|d}d }||f}| j
rP||f }|S )Nr   r   rO   zibh,hnd->ibndrP   zmbnd,mlb->lbndzlbnd,mlb->mbnd)lenr   r   concatrR   r<   r=   r?   r8   r_   r%   rb   )r3   rU   rQ   r`   gZattn_mask_hZattn_mask_gr?   rY   memstarget_mappingr[   catrV   rW   rX   Zq_head_hZ
attn_vec_hZattn_prob_houtput_hZq_head_gZ
attn_vec_gZattn_prob_goutput_gr]   r^   outputsr   r   r   call   sl    
 
  
 
zTFXLNetRelativeAttention.call)rL   )F)TF)F)__name__
__module____qualname__r$   rF   rK   rN   r_   rb   rl   __classcell__r   r   r6   r   r   B   s   

-
r   c                       s&   e Zd Z fddZdddZ  ZS )TFXLNetFeedForwardc                    s   t  jf | tjjj|jdd| _tjjj|j	t
|jdd| _tjjj|jt
|jdd| _tjj|j| _t|jtrt|j | _n|j| _d S )Nr   r    layer_1Zkernel_initializerr"   layer_2)r#   r$   r   r,   r-   r.   r/   r   DenseZd_innerr	   r+   rr   r&   rt   r0   r1   
isinstanceZff_activationstrACT2FNactivation_functionr2   r6   r   r   r$   "  s         zTFXLNetFeedForward.__init__Fc                 C   sP   |}|  |}| |}| j||d}| |}| j||d}| || }|S )NrP   )rr   ry   r1   rt   r   )r3   inprQ   ra   r   r   r   rl   1  s    


zTFXLNetFeedForward.call)Frm   rn   ro   r$   rl   rp   r   r   r6   r   rq   !  s   rq   c                       s&   e Zd Z fddZdddZ  ZS )TFXLNetLayerc                    s@   t  jf | t|dd| _t|dd| _tjj	|j
| _
d S )Nrel_attnr"   ff)r#   r$   r   r}   rq   r   r   r,   r-   r0   r1   r2   r6   r   r   r$   =  s    zTFXLNetLayer.__init__Fc                 C   sZ   | j ||d}|d d \}}|d k	r4| j||d}| j||d}||f|dd   }|S )NrP   r   )r}   r   )r3   rU   rQ   rk   ri   rj   r   r   r   rl   C  s    zTFXLNetLayer.call)Fr{   r   r   r6   r   r|   <  s   r|   c                       s0   e Zd Z fddZ fddZdd Z  ZS )TFXLNetLMHeadc                    s    t  jf | |j| _|| _d S r   )r#   r$   
vocab_sizeinput_embeddings)r3   r4   r   r5   r6   r   r   r$   P  s    zTFXLNetLMHead.__init__c                    s(   | j | jfdddd| _t | d S )Nr@   Tbiasr9   )rE   r   r   r#   rF   )r3   rH   r6   r   r   rF   W  s    zTFXLNetLMHead.buildc                 C   s   | j |dd}|| j }|S )NZlinear)mode)r   r   )r3   hidden_statesr   r   r   rl   [  s    
zTFXLNetLMHead.call)rm   rn   ro   r$   rF   rl   rp   r   r   r6   r   r   O  s   r   c                
       sx   e Zd ZeZ fddZdd Zdd Zdd Zd	d
 Z	e
jfddZdd ZedddZdddZdddZ  ZS )TFXLNetMainLayerc                    s   t  jf |  j| _ j| _ j| _ j| _ j| _ j| _ j| _ j	| _	 j
| _
 j| _ j| _ j| _t j j jdd| _ fddt jD | _tjj j| _d S )Nword_embeddingr+   r"   c                    s   g | ]}t  d |dqS )z
layer_._{}r~   )r|   format).0ir4   r   r   
<listcomp>x  s     z-TFXLNetMainLayer.__init__.<locals>.<listcomp>)r#   r$   r%   output_hidden_statesmem_len	reuse_lenr&   same_length	attn_typebi_data	clamp_lenn_layeruse_bfloat16r+   r   r   r   rangelayerr   r,   r-   r0   r1   r2   r6   r   r   r$   e  s*       zTFXLNetMainLayer.__init__c                 C   s   | j S r   )r   r3   r   r   r   get_input_embeddings{  s    z%TFXLNetMainLayer.get_input_embeddingsc                 C   s*   t | j}| jdd| jf|ddd| _d S )Nr   Tmask_embr9   )r	   r+   rE   r&   r   rG   r   r   r   rF   ~  s    

   zTFXLNetMainLayer.buildc                 C   s   t d S r   rI   )r3   Znew_num_tokensr   r   r   _resize_token_embeddings  s    z)TFXLNetMainLayer._resize_token_embeddingsc                 C   s   t d S r   rI   )r3   Zheads_to_pruner   r   r   _prune_heads  s    zTFXLNetMainLayer._prune_headsc           
      C   s   t j||g|d}t |dd}t |dd}t j||g|d}t ||| gd}| jrt |dd}	t |ddd|f |	 | |dd|df gd}|S )a  
        Creates causal attention mask. Float mask where 1.0 indicates masked, 0.0 indicates not-masked.

        Args:
            qlen: TODO Lysandre didn't fill
            mlen: TODO Lysandre didn't fill

        ::

                  same_length=False:      same_length=True:
                  <mlen > <  qlen >       <mlen > <  qlen >
               ^ [0 0 0 0 0 1 1 1 1]     [0 0 0 0 0 1 1 1 1]
                 [0 0 0 0 0 0 1 1 1]     [1 0 0 0 0 0 1 1 1]
            qlen [0 0 0 0 0 0 0 1 1]     [1 1 0 0 0 0 0 1 1]
                 [0 0 0 0 0 0 0 0 1]     [1 1 1 0 0 0 0 0 1]
               v [0 0 0 0 0 0 0 0 0]     [1 1 1 1 0 0 0 0 0]

        rS   r   rL   r   N)r   onesZmatrix_band_partr@   rd   r   )
r3   qlenmlenrS   rZ   Zmask_uZmask_diaZattn_mask_padretZmask_lr   r   r   create_mask  s    8zTFXLNetMainLayer.create_maskc                 C   sb   | j dk	r"| j dkr"|d| j  }|dkr<|| j d }nt||gd| j d }t|S )z cache hidden states into memory.Nr   )r   r   r   rd   Zstop_gradient)r3   Zcurr_outZprev_memZnew_memr   r   r   	cache_mem  s    zTFXLNetMainLayer.cache_memNc                 C   s`   t d| |}t jt |t |gdd}|d d d d d f }|d k	r\t |d|dg}|S )Nzi,d->idrL   rO   r   )r   rR   rd   sincostile)Zpos_seqinv_freqbszZsinusoid_inppos_embr   r   r   positional_embedding  s    z%TFXLNetMainLayer.positional_embeddingc                 C   s  t d| jd}|dk	r0|t jkr0t j||d}dd|| j   }| jdkrZ||  }}n&| jdkrp|d	 }}ntd
| j| jrxt ||d}	t | | d}
|dk	r|t jkrt j|	|d}	t j|
|d}
| j	dkr
t 
|	| j	 | j	}	t 
|
| j	 | j	}
|dk	rL|d dks&t| |	||d }| |
||d }n| |	|}| |
|}t j||gdd}n`t ||d}	|dk	r|t jkrt j|	|d}	| j	dkrt 
|	| j	 | j	}	| |	||}|S )z$create relative positional encoding.r   g       @Nr   r   i'  biunirL   zUnknown `attn_type` {}.g      r   r   rO   )r   r   r&   float32castr   r(   r   r   r   Zclip_by_valueAssertionErrorr   rd   )r3   r   rM   r   rS   Zfreq_seqr   begendZfwd_pos_seqZbwd_pos_seqZfwd_pos_embZbwd_pos_embr   r   r   r   relative_positional_encoding  s@    


z-TFXLNetMainLayer.relative_positional_encodingTFc           %      C   sP  t |ttfr|d }t|dkr,|d n|}t|dkrD|d n|}t|dkr\|d n|}t|dkrt|d n|}t|dkr|d n|}t|dkr|d n|}t|dkr|d n|}t|d	kr|d	 n|	}	t|d
kr|d
 n|
}
t|dkstdnt |ttfr|d}|d|}|d|}|d|}|d|}|d|}|d|}|d|}|d|	}	|d|
}
t|dkstdn|}|d k	r|	d k	rtdnd|d k	rt	j
|dd}t|d d \}}n6|	d k	r$t	j
|	dd}	t|	d d \}}ntd|d k	rDt	j
|ddnd }|d k	r`t	j
|ddnd }|d k	r|t	j
|ddnd }|d k	rt	j
|ddnd }|d k	rt	j
|ddnd }|d k	r|d d k	rt|d d nd}|| }| jrt	jnt	j}| jdkr2| ||}|d d d d d d f }n"| jdkrDd }ntd| j|d ksp|d ksptd |d kr|d k	rd!t	j||d" }|d k	r|d k	r|d  | }n<|d k	r|d kr|d  }n|d kr|d k	r|}nd }|d k	rtt	jt|d ||g|d"}t	j||gdd#}|d krT|d d d d d d d f }n ||d d d d d d d f 7 }|d k	rt	j|dk|d"}|d k	rt	j||d" }t	jt	j||g|d"|gd$d#}t	j||d d d d d d f  dk|d"}nd }|	d k	r
|	}n
| |}| j||d%}|d k	rXt	| jt|d |dg}| j||d%}nd }|d k	rt	j||gt	jd"}t	||gd}t	t	t	|d d d f |d d d f t	j}t	j|d|d"}nd }| j||||d&}| j||d%}|d k	rz| dkr>| d d d d}|!| j"d$d$d$d$}n$| dkrb| d d d}|j#t$| % j&d"}nd g| j" }d'}|d krd gt| j' }g }g } t(| j'D ]\}!}"| j)d k	r| j)dkr|
d(kr|| *|||! f }| j+r| ,|d k	r||fn| |"||||||||! |||! g	|d%}#|#d d \}}| j-r|,|#d  q| j+r| ,|d k	r||fn| | j|d k	r|n||d%}$t	j
|$ddf}#| j)d k	r| j)dkr|
d(kr|#|f }#| j+r(|d k	rtd)d* | D } ntd+d* | D } |#| f }#| j-rLtd,d* |D }|#|f }#|#S )-Nr   r   r   r                  	   
   zToo many inputs.	input_idsattention_maskrf   	perm_maskrg   token_type_ids
input_maskr[   inputs_embeds	use_cachezDYou cannot specify both input_ids and inputs_embeds at the same time)r   r   permr   r   r   z5You have to specify either input_ids or inputs_embeds)r   r   r   r   r   zUnsupported attention type: {}zYou can only use one of input_mask (uses 1 for padding) or attention_mask (uses 0 for padding, added for compatbility with BERT). Please choose one.r   r   rO   rL   rP   )r   rS   r   Tc                 s   s&   | ]}|D ]}t j|d dV  q
qdS r   r   Nr   	transpose)r   hsr`   r   r   r   	<genexpr>  s       z(TFXLNetMainLayer.call.<locals>.<genexpr>c                 s   s   | ]}t j|d dV  qdS r   r   )r   r   r   r   r   r     s     c                 s   s   | ]}t j|d dV  qdS ))r   r   r   r   r   Nr   )r   tr   r   r   r     s     ).rv   tuplelistrc   r   dictr   getr(   r   r   r   r   Zbfloat16r   r   r   r   r   r@   rd   Zeyer   r1   r   r   int32Zlogical_notequalZone_hotr   ZdimZ	unsqueezeexpandr   tonext
parametersrS   r   	enumerater   r   r   appendr%   )%r3   rU   r   rf   r   rg   r   r   r[   r   r   rQ   r   r   r   r   rM   Zdtype_floatrZ   Z	data_maskZ	mems_maskZnon_tgt_maskZ
word_emb_kri   Z
word_emb_qrj   Zmem_padZcat_idsrY   r   Znew_memsZ
attentionsr   r   Zlayer_modulerk   ra   r   r   r   rl     s   



,


 

 ,



4

""



zTFXLNetMainLayer.call)N)NN)
NNNNNNNNTF)rm   rn   ro   r   config_classr$   r   rF   r   r   r   r   r   r   staticmethodr   r   rl   rp   r   r   r6   r   r   a  s,   

3          r   c                   @   s   e Zd ZdZeZeZdZdS )TFXLNetPreTrainedModelz An abstract class to handle weights initialization and
        a simple interface for downloading and loading pretrained models.
    transformerN)	rm   rn   ro   __doc__r   r   %TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAPZpretrained_model_archive_mapZbase_model_prefixr   r   r   r   r     s   r   a  

    .. note::

        TF 2.0 models accepts two formats as inputs:

            - having all inputs as keyword arguments (like PyTorch models), or
            - having all inputs as a list, tuple or dict in the first positional arguments.

        This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having
        all the tensors in the first argument of the model call function: :obj:`model(inputs)`.

        If you choose this second option, there are three possibilities you can use to gather all the input Tensors
        in the first positional argument :

        - a single Tensor with input_ids only and nothing else: :obj:`model(inputs_ids)`
        - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
          :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])`
        - a dictionary with one or several input Tensors associated to the input names given in the docstring:
          :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`

    Parameters:
        config (:class:`~transformers.XLNetConfig`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the configuration.
            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
a  
    Args:
        input_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using :class:`transformers.XLNetTokenizer`.
            See :func:`transformers.PreTrainedTokenizer.encode` and
            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.

            `What are input IDs? <../glossary.html#input-ids>`__
        attention_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.

            `What are attention masks? <../glossary.html#attention-mask>`__
        mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`):
            Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
            (see `mems` output below). Can be used to speed up sequential decoding. The token ids which have their mems
            given to this model should not be passed as input ids as they have already been computed.
        perm_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, sequence_length)`, `optional`, defaults to :obj:`None`):
            Mask to indicate the attention pattern for each input token with values selected in ``[0, 1]``:
            If ``perm_mask[k, i, j] = 0``, i attend to j in batch k;
            if ``perm_mask[k, i, j] = 1``, i does not attend to j in batch k.
            If None, each token attends to all the others (full bidirectional attention).
            Only used during pretraining (to define factorization order) or for sequential decoding (generation).
        target_mapping (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, num_predict, sequence_length)`, `optional`, defaults to :obj:`None`):
            Mask to indicate the output tokens to use.
            If ``target_mapping[k, i, j] = 1``, the i-th predict in batch k is on the j-th token.
            Only used during pretraining for partial prediction or for sequential decoding (generation).
        token_type_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Segment token indices to indicate first and second portions of the inputs.
            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
            corresponds to a `sentence B` token

            `What are token type IDs? <../glossary.html#token-type-ids>`_
        input_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Mask to avoid performing attention on padding token indices.
            Negative of `attention_mask`, i.e. with 0 for real tokens and 1 for padding.
            Kept for compatibility with the original code base.
            You can only uses one of `input_mask` and `attention_mask`
            Mask values selected in ``[0, 1]``:
            ``1`` for tokens that are MASKED, ``0`` for tokens that are NOT MASKED.
        head_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
            Mask to nullify selected heads of the self-attention modules.
            Mask values selected in ``[0, 1]``:
            :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
        input_embeds (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
            than the model's internal embedding lookup matrix.
        use_cache (:obj:`bool`):
            If `use_cache` is True, `mems` are returned and can be used to speed up decoding (see `mems`). Defaults to `True`.
z^The bare XLNet Model transformer outputing raw hidden-states without any specific head on top.c                       s,   e Zd Z fddZeedd Z  ZS )TFXLNetModelc                    s&   t  j|f|| t|dd| _d S )Nr   r~   )r#   r$   r   r   r3   r4   rU   r5   r6   r   r   r$     s    zTFXLNetModel.__init__c                 K   s   | j |f|}|S )a(  
    Return:
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLNetConfig`) and inputs:
        last_hidden_state (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the last layer of the model.
        mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`):
            Contains pre-computed hidden-states (key and values in the attention blocks).
            Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model
            should not be passed as input ids as they have already been computed.
        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        import tensorflow as tf
        from transformers import XLNetTokenizer, TFXLNetModel

        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
        model = TFXLNetModel.from_pretrained('xlnet-large-cased')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
        outputs = model(input_ids)
        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple

        )r   )r3   rU   r5   rk   r   r   r   rl   "  s    #zTFXLNetModel.callrm   rn   ro   r$   r   XLNET_INPUTS_DOCSTRINGrl   rp   r   r   r6   r   r     s   r   zoXLNet Model with a language modeling head on top
    (linear layer with weights tied to the input embeddings). c                       s<   e Zd Z fddZdd Zdd Zeedd Z  Z	S )	TFXLNetLMHeadModelc                    s:   t  j|f|| t|dd| _t|| jjdd| _d S )Nr   r~   lm_loss)r#   r$   r   r   r   r   r   r   r6   r   r   r$   O  s    zTFXLNetLMHeadModel.__init__c                 C   s   | j jS r   )r   r   r   r   r   r   get_output_embeddingsT  s    z(TFXLNetLMHeadModel.get_output_embeddingsc                 K   s   |j d }tj|dftjd}tj||gdd}|j d }tj|||d ftjd}tj||dftjd}tj||gdd}tj|d|d ftjd}	tj|ddftjd}
tj|	|
gdd}	|||	|d d}|r||d< |S )	Nr   r   r   rO   rL   r   )rU   r   rg   r   rf   )r:   r   r@   r   rd   r   r   )r3   rU   Zpastr5   Zeffective_batch_sizeZdummy_tokenZsequence_lengthr   Zperm_mask_seq_endrg   Ztarget_mapping_seq_endr   r   r   prepare_inputs_for_generationW  s$    

z0TFXLNetLMHeadModel.prepare_inputs_for_generationc                 K   s6   | j |f|}|d }| |}|f|dd  }|S )a  
    Return:
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLNetConfig`) and inputs:
        prediction_scores (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`):
            Contains pre-computed hidden-states (key and values in the attention blocks).
            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
            should not be passed as input ids as they have already been computed.
        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        import tensorflow as tf
        import numpy as np
        from transformers import XLNetTokenizer, TFXLNetLMHeadModel

        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
        model = TFXLNetLMHeadModel.from_pretrained('xlnet-large-cased')

        # We show how to setup inputs to predict a next token using a bi-directional context.
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is very <mask>", add_special_tokens=True))[None, :]  # We will predict the masked token
        perm_mask = np.zeros((1, input_ids.shape[1], input_ids.shape[1]))
        perm_mask[:, :, -1] = 1.0  # Previous tokens don't see last token
        target_mapping = np.zeros((1, 1, input_ids.shape[1]))  # Shape [1, 1, seq_length] => let's predict one token
        target_mapping[0, 0, -1] = 1.0  # Our first (and only) prediction will be the last token of the sequence (the masked token)
        outputs = model(input_ids, perm_mask=tf.constant(perm_mask, dtype=tf.float32), target_mapping=tf.constant(target_mapping, dtype=tf.float32))

        next_token_logits = outputs[0]  # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]

        r   r   N)r   r   )r3   rU   r5   transformer_outputsZhidden_statelogitsrk   r   r   r   rl   v  s
    +
zTFXLNetLMHeadModel.call)
rm   rn   ro   r$   r   r   r   r   rl   rp   r   r   r6   r   r   I  s
   r   zXLNet Model with a sequence classification/regression head on top (a linear layer on top of
    the pooled output) e.g. for GLUE tasks. c                       s,   e Zd Z fddZeedd Z  ZS ) TFXLNetForSequenceClassificationc                    s^   t  j|f|| |j| _t|dd| _t||jdd| _tj	j
j|jt|jdd| _d S )Nr   r~   sequence_summaryr   logits_projrs   )r#   r$   
num_labelsr   r   r   r+   r   r   r,   r-   ru   r	   r   r   r6   r   r   r$     s        z)TFXLNetForSequenceClassification.__init__c                 K   s@   | j |f|}|d }| |}| |}|f|dd  }|S )a  
    Return:
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLNetConfig`) and inputs:
        logits (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`):
            Contains pre-computed hidden-states (key and values in the attention blocks).
            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
            should not be passed as input ids as they have already been computed.
        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        import tensorflow as tf
        from transformers import XLNetTokenizer, TFXLNetForSequenceClassification

        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
        model = TFXLNetForSequenceClassification.from_pretrained('xlnet-large-cased')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
        outputs = model(input_ids)
        logits = outputs[0]

        r   r   N)r   r   r   r3   rU   r5   r   ra   r   rk   r   r   r   rl     s    #

z%TFXLNetForSequenceClassification.callr   r   r   r6   r   r     s   r   zXLNet Model with a token classification head on top (a linear layer on top of
    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. c                       s$   e Zd Z fddZdd Z  ZS )TFXLNetForTokenClassificationc                    sL   t  j|f|| |j| _t|dd| _tjjj|jt	|j
dd| _d S )Nr   r~   
classifierrs   )r#   r$   r   r   r   r   r,   r-   ru   r	   r+   r   r   r6   r   r   r$     s      z&TFXLNetForTokenClassification.__init__c                 K   s6   | j |f|}|d }| |}|f|dd  }|S )a  
    Return:
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLNetConfig`) and inputs:
        logits (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:(batch_size, config.num_labels)`):
            Classification scores (before SoftMax).
        mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`):
            Contains pre-computed hidden-states (key and values in the attention blocks).
            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
            should not be passed as input ids as they have already been computed.
        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        import tensorflow as tf
        from transformers import XLNetTokenizer, TFXLNetForTokenClassification

        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
        model = TFXLNetForTokenClassification.from_pretrained('xlnet-large-cased')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
        outputs = model(input_ids)
        scores = outputs[0]

        r   r   N)r   r   r   r   r   r   rl     s
    "
z"TFXLNetForTokenClassification.callr{   r   r   r6   r   r     s   	r   zXLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
    the hidden-states output to compute `span start logits` and `span end logits`). c                       s,   e Zd Z fddZeedd Z  ZS )!TFXLNetForQuestionAnsweringSimplec                    sD   t  j|f|| t|dd| _tjjj|jt	|j
dd| _d S )Nr   r~   
qa_outputsrs   )r#   r$   r   r   r   r,   r-   ru   r   r	   r+   r   r   r6   r   r   r$   +  s      z*TFXLNetForQuestionAnsweringSimple.__init__c           	      K   sh   | j |f|}|d }| |}tj|ddd\}}tj|dd}tj|dd}||f|dd  }|S )ap	  
    Returns:
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLNetConfig`) and inputs:
        loss (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
            Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
        start_scores (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length,)`):
            Span-start scores (before SoftMax).
        end_scores (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length,)`):
            Span-end scores (before SoftMax).
        mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`):
            Contains pre-computed hidden-states (key and values in the attention blocks).
            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
            should not be passed as input ids as they have already been computed.
        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        import tensorflow as tf
        from transformers import XLNetTokenizer, TFXLNetForQuestionAnsweringSimple

        tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
        model = TFXLNetForQuestionAnsweringSimple.from_pretrained('xlnet-base-cased')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
        outputs = model(input_ids)
        start_scores, end_scores = outputs[:2]

        r   r   rL   rO   r   N)r   r   r   splitZsqueeze)	r3   rU   r5   r   Zsequence_outputr   Zstart_logitsZ
end_logitsrk   r   r   r   rl   2  s    '

z&TFXLNetForQuestionAnsweringSimple.callr   r   r   r6   r   r   %  s   r   ).r   loggingZnumpyr   Z
tensorflowr   Zconfiguration_xlnetr   Z
file_utilsr   r   Zmodeling_tf_utilsr   r   r   r	   r
   r   Ztokenization_utilsr   	getLoggerrm   loggerr   r   r   r,   r-   Z
ActivationZactivationsr   rx   ZLayerr   rq   r|   r   r   r   ZXLNET_START_DOCSTRINGr   r   r   r   r   r   r   r   r   r   <module>   sl    
	 `  \
8,\;6