U
    &ºcó7 ã                   @   sÊ  d Z ddlZddlZddlmZ ddlmZmZ ddlmZ ddl	m
Z
mZ ddlmZ dd	lmZmZ dd
lmZmZmZmZmZ e e¡ZdddœZd1dd„Zdd„ Ze
ejjjedœZejZ G dd„ dej!ƒZ"G dd„ dej!ƒZ#G dd„ dej!ƒZ$G dd„ deƒZ%dZ&dZ'ede&ƒG dd„ de%ƒƒZ(ed e&ƒG d!d"„ d"e%ƒƒZ)ed#e&ƒG d$d%„ d%e%ƒƒZ*ed&e&ƒG d'd(„ d(e%ƒƒZ+ed)e&ƒG d*d+„ d+e%ƒƒZ,ed,e&ƒG d-d.„ d.e%ƒƒZ-ed,e&ƒG d/d0„ d0e%ƒƒZ.dS )2z PyTorch XLNet model.
é    N)Únn)ÚCrossEntropyLossÚMSELoss)Ú
functionalé   )Úgelu_newÚswish)ÚXLNetConfig)Úadd_start_docstringsÚ add_start_docstrings_to_callable)ÚPoolerAnswerClassÚPoolerEndLogitsÚPoolerStartLogitsÚPreTrainedModelÚSequenceSummaryz=https://cdn.huggingface.co/xlnet-base-cased-pytorch_model.binz>https://cdn.huggingface.co/xlnet-large-cased-pytorch_model.bin)zxlnet-base-casedzxlnet-large-casedc                 C   s$  i }t | dƒr¤t | dƒr$| jj|d< t | dƒrRd|krR| jjj|d< | jjj|d< t | dƒrž|jdk	ržd	 |j¡|krž| jj|d	 |j¡< | jj|d
 |j¡< | j	} | 
| jj| jdœ¡ t| jƒD ]Æ\}}d| }| 
|d |jjj|d |jjj|d |jj|d |jj|d |jj|d |jj|d |jj|d |jjj|d |jjj|d |jjj|d |jjj|d |jjj|d |jjji¡ qÄ|jrìg }g }g }	g }
| jD ]>}| |jj¡ | |jj¡ |	 |jj¡ |
 |jj¡ qªn | jg}| jg}| jg}	| jg}
| 
|||	|
dœ¡ |S )z˜ A map of modules from TF to PyTorch.
        I use a map to keep the PyTorch model as
        identical to the original PyTorch model as possible.
    ÚtransformerÚlm_losszmodel/lm_loss/biasÚsequence_summaryz%model/sequnece_summary/summary/kernelz#model/sequnece_summary/summary/biasÚlogits_projNz model/regression_{}/logit/kernelzmodel/regression_{}/logit/bias)z-model/transformer/word_embedding/lookup_tablez#model/transformer/mask_emb/mask_embzmodel/transformer/layer_%d/zrel_attn/LayerNorm/gammazrel_attn/LayerNorm/betazrel_attn/o/kernelzrel_attn/q/kernelzrel_attn/k/kernelzrel_attn/r/kernelzrel_attn/v/kernelzff/LayerNorm/gammazff/LayerNorm/betazff/layer_1/kernelzff/layer_1/biaszff/layer_2/kernelzff/layer_2/bias)zmodel/transformer/r_r_biaszmodel/transformer/r_w_biaszmodel/transformer/r_s_biaszmodel/transformer/seg_embed)Úhasattrr   Úbiasr   ÚsummaryÚweightZfinetuning_taskÚformatr   r   ÚupdateÚword_embeddingÚmask_embÚ	enumerateÚlayerÚrel_attnÚ
layer_normÚoÚqÚkÚrÚvÚffÚlayer_1Úlayer_2Zuntie_rÚappendÚr_r_biasÚr_w_biasÚr_s_biasÚ	seg_embed)ÚmodelÚconfigÚ
tf_weightsÚtf_to_pt_mapÚiÚbZ	layer_strZr_r_listZr_w_listZr_s_listZseg_embed_list© r4   ú?/tmp/pip-unpacked-wheel-ymerj3tt/transformers/modeling_xlnet.pyÚbuild_tf_xlnet_to_pytorch_map)   s”    

ÿþýþÿ             óÿ
üÿr6   c                 C   sx  zddl }ddl}W n  tk
r4   t d¡ ‚ Y nX |j |¡}i }|D ]0\}}t d ||¡¡ |j 	||¡}	|	||< qJt
| ||ƒ}
|
 ¡ D ]Æ\}}t d |¡¡ ||krÄt d |¡¡ q|| }	d|krd|ksðd	|ksðd
|krt d¡ | |	¡}	t|tƒr¼t|ƒ|	jd ks(t‚t|ƒD ]ˆ\}}|	|df }z|j|jksXt‚W n< tk
r– } z| j|j|jf7  _‚ W 5 d}~X Y nX t d ||¡¡ t |¡|_q0npz|j|	jksÐt‚W n< tk
r } z| j|j|	jf7  _‚ W 5 d}~X Y nX t d |¡¡ t |	¡|_| |d¡ | |d d¡ | |d d¡ qt d d | ¡ ¡¡¡ | S )z, Load tf checkpoints in a pytorch model
    r   NzšLoading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see https://www.tensorflow.org/install/ for installation instructions.z"Loading TF weight {} with shape {}zImporting {}z*{} not in tf pre-trained weights, skippingÚkernelr&   r   ZlogitZTransposing.z)Initialize PyTorch weight {} for layer {}zInitialize PyTorch weight {}z/Adamz/Adam_1z'Weights not copied to PyTorch model: {}z, )ZnumpyZ
tensorflowÚImportErrorÚloggerÚerrorZtrainZlist_variablesÚinfor   Zload_variabler6   ÚitemsÚ	transposeÚ
isinstanceÚlistÚlenÚshapeÚAssertionErrorr   ÚargsÚtorchZ
from_numpyÚdataÚpopÚjoinÚkeys)r.   r/   Ztf_pathÚnpÚtfZ	init_varsr0   ÚnamerA   Úarrayr1   Úpointerr2   Zp_iZarr_iÚer4   r4   r5   Úload_tf_weights_in_xlnet|   s^    ÿ
$

rO   )ZgeluÚrelur   c                       s^   e Zd Z‡ fdd„Zdd„ Zeddd„ƒZeddd	„ƒZddd„Zddd„Z	ddd„Z
‡  ZS )ÚXLNetRelativeAttentionc                    sp  t ƒ  ¡  |j| _|j|j dkr6td|j|jf ƒ‚|j| _|j| _|j| _d|jd  | _t 	t
 |j| j| j¡¡| _t 	t
 |j| j| j¡¡| _t 	t
 |j| j| j¡¡| _t 	t
 |j| j| j¡¡| _t 	t
 |j| j| j¡¡| _t 	t
 | j| j¡¡| _t 	t
 | j| j¡¡| _t 	t
 | j| j¡¡| _t 	t
 d| j| j¡¡| _t|j|jd| _t |j¡| _d S )Nr   zLThe hidden size (%d) is not a multiple of the number of attention heads (%d)r   ç      à?é   ©Zeps)ÚsuperÚ__init__Úoutput_attentionsÚd_modelZn_headÚ
ValueErrorZd_headÚscaler   Ú	ParameterrD   ÚFloatTensorr"   r#   r%   r!   r$   r*   r,   r+   r-   ÚXLNetLayerNormÚlayer_norm_epsr    ÚDropoutÚdropout©Úselfr/   ©Ú	__class__r4   r5   rV   Á   s.    

ÿÿzXLNetRelativeAttention.__init__c                 C   s   t ‚d S ©N©ÚNotImplementedError)rb   Zheadsr4   r4   r5   Úprune_headsÞ   s    z"XLNetRelativeAttention.prune_headséÿÿÿÿc              	   C   s|   | j }|  |d |d |d |d ¡} | dd…df } |  |d |d d |d |d ¡} t | dtj|| jtjd¡} | S )z<perform relative shift to form the relative attention score.r   r   rS   é   N.©ÚdeviceÚdtype©rA   ZreshaperD   Zindex_selectÚarangerl   Úlong©ÚxÚklenZx_sizer4   r4   r5   Ú	rel_shiftá   s     $z XLNetRelativeAttention.rel_shiftc              	   C   sŒ   | j }|  |d |d |d |d ¡} | d d …d d …dd …d d …f } |  |d |d |d |d d ¡} t | dtj|| jtjd¡} | S )Nr   r   rj   rS   rk   rn   rq   r4   r4   r5   Úrel_shift_bnijî   s      $z%XLNetRelativeAttention.rel_shift_bnijNc                 C   s  t  d|| j |¡}t  d|| j |¡}	| j|	|jd d}	|dkrJd}
n$t  d|| j | j¡}
t  d||
¡}
||	 |
 | j }|dk	r¾|j	t j
krª|dt  d	|¡  }n|d
t  d	|¡  }tj|dd}|  |¡}|dk	rî|t  d	|¡ }t  d||¡}| jr|t  d|¡fS |S )z.Core relative positional attention operations.zibnd,jbnd->bnijrj   )rs   Nr   zibnd,snd->ibnszijbs,ibns->bnijiÜÿ  z
ijbn->bnijgêŒ 9Y>)F©Údimzbnij,jbnd->ibndz
bnij->ijbn)rD   Úeinsumr+   r*   ru   rA   r,   r-   rZ   rm   Zfloat16ÚFÚsoftmaxr`   rW   )rb   Zq_headÚk_head_hÚv_head_hÚk_head_rÚseg_matÚ	attn_maskÚ	head_maskÚacZbdZefZ
attn_scoreÚ	attn_probÚattn_vecr4   r4   r5   Úrel_attn_coreý   s(    
z$XLNetRelativeAttention.rel_attn_coreTc                 C   s4   t  d|| j¡}|  |¡}|r&|| }|  |¡}|S )zPost-attention processing.zibnd,hnd->ibh)rD   rx   r!   r`   r    )rb   Úhrƒ   ZresidualZattn_outÚoutputr4   r4   r5   Úpost_attention'  s    

z%XLNetRelativeAttention.post_attentionc
              	   C   s  |d k	rL|d k	r2|  ¡ dkr2tj||gdd}
n|}
t d|
| j¡}t d|
| j¡}t d|| j¡}t d|| j¡}| j|||||||	d}| j	rœ|\}}|  
||¡}t d|| j¡}|d k	rt d||¡}| j|||||||	d}| j	rö|\}}t d||¡}n(| j|||||||	d}| j	r.|\}}|  
||¡}| j	rô||f}n¨|d k	rx|  ¡ dkrxtj||gdd}
n|}
t d|| j¡}t d|
| j¡}t d|
| j¡}t d|| j¡}| j|||||||	d}| j	rä|\}}|  
||¡}d }||f}| j	r||f }|S )Nr   r   rv   zibh,hnd->ibnd)r~   r   r€   zmbnd,mlb->lbndzlbnd,mlb->mbnd)rw   rD   Úcatrx   r#   r%   r$   r"   r„   rW   r‡   )rb   r…   ÚgÚattn_mask_hÚattn_mask_gr$   r~   ÚmemsÚtarget_mappingr€   rˆ   r{   r|   r}   Zq_head_hZ
attn_vec_hZattn_prob_hÚoutput_hZq_head_gZ
attn_vec_gZattn_prob_gÚoutput_gr‚   rƒ   Úoutputsr4   r4   r5   Úforward3  s’    
      ÿ
      ÿ      ÿ
      ÿ
zXLNetRelativeAttention.forward)ri   )ri   )NNN)T)NNN)Ú__name__Ú
__module__Ú__qualname__rV   rh   Ústaticmethodrt   ru   r„   r‡   r‘   Ú__classcell__r4   r4   rc   r5   rQ   À   s   
*
rQ   c                       s$   e Zd Z‡ fdd„Zdd„ Z‡  ZS )ÚXLNetFeedForwardc                    st   t ƒ  ¡  t|j|jd| _t |j|j¡| _	t |j|j¡| _
t |j¡| _t|jtƒrht|j | _n|j| _d S )NrT   )rU   rV   r]   rX   r^   r    r   ÚLinearZd_innerr'   r(   r_   r`   r>   Zff_activationÚstrÚACT2FNÚactivation_functionra   rc   r4   r5   rV   “  s    
zXLNetFeedForward.__init__c                 C   sH   |}|   |¡}|  |¡}|  |¡}|  |¡}|  |¡}|  || ¡}|S re   )r'   r›   r`   r(   r    )rb   Úinpr†   r4   r4   r5   r‘   ž  s    




zXLNetFeedForward.forward©r’   r“   r”   rV   r‘   r–   r4   r4   rc   r5   r—   ’  s   r—   c                       s&   e Zd Z‡ fdd„Zddd„Z‡  ZS )Ú
XLNetLayerc                    s0   t ƒ  ¡  t|ƒ| _t|ƒ| _t |j¡| _d S re   )	rU   rV   rQ   r   r—   r&   r   r_   r`   ra   rc   r4   r5   rV   ª  s    


zXLNetLayer.__init__Nc
                 C   s`   | j |||||||||	d	}
|
d d… \}}|d k	r>|  |¡}|  |¡}||f|
dd …  }
|
S )N)rŒ   r   r€   rS   )r   r&   )rb   rŽ   r   rŠ   r‹   r$   r~   rŒ   r   r€   r   r4   r4   r5   r‘   °  s"    ÷

zXLNetLayer.forward)NNNr   r4   r4   rc   r5   rž   ©  s
        ÿrž   c                   @   s(   e Zd ZdZeZeZeZ	dZ
dd„ ZdS )ÚXLNetPreTrainedModelz† An abstract class to handle weights initialization and
        a simple interface for downloading and loading pretrained models.
    r   c              	   C   sä   t |tjtjfƒrL|jjjd| jjd t |tjƒrà|j	dk	rà|j	j 
¡  n”t |tƒrr|j	j 
¡  |jj d¡ nnt |tƒrÀ|j|j|j|j|j|j|j|j|jf	D ]}|jjd| jjd q¤n t |tƒrà|jjjd| jjd dS )z! Initialize the weights.
        g        )ZmeanZstdNç      ð?)r>   r   r˜   Ú	Embeddingr   rE   Znormal_r/   Zinitializer_ranger   Zzero_r]   Zfill_rQ   r"   r#   r%   r!   r$   r*   r,   r+   r-   Ú
XLNetModelr   )rb   ÚmoduleÚparamr4   r4   r5   Ú_init_weightsÒ  s*    

÷
z"XLNetPreTrainedModel._init_weightsN)r’   r“   r”   Ú__doc__r	   Zconfig_classÚ"XLNET_PRETRAINED_MODEL_ARCHIVE_MAPZpretrained_model_archive_maprO   Zload_tf_weightsZbase_model_prefixr¥   r4   r4   r4   r5   rŸ   È  s   rŸ   ao  

    This model is a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`_ sub-class.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general
    usage and behavior.

    Parameters:
        config (:class:`~transformers.XLNetConfig`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the configuration.
            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
a'  
    Args:
        input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using :class:`transformers.BertTokenizer`.
            See :func:`transformers.PreTrainedTokenizer.encode` and
            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.

            `What are input IDs? <../glossary.html#input-ids>`__
        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.

            `What are attention masks? <../glossary.html#attention-mask>`__
        mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
            Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
            (see `mems` output below). Can be used to speed up sequential decoding. The token ids which have their mems
            given to this model should not be passed as input ids as they have already been computed.
            `use_cache` has to be set to `True` to make use of `mems`.
        perm_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, sequence_length)`, `optional`, defaults to :obj:`None`):
            Mask to indicate the attention pattern for each input token with values selected in ``[0, 1]``:
            If ``perm_mask[k, i, j] = 0``, i attend to j in batch k;
            if ``perm_mask[k, i, j] = 1``, i does not attend to j in batch k.
            If None, each token attends to all the others (full bidirectional attention).
            Only used during pretraining (to define factorization order) or for sequential decoding (generation).
        target_mapping (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_predict, sequence_length)`, `optional`, defaults to :obj:`None`):
            Mask to indicate the output tokens to use.
            If ``target_mapping[k, i, j] = 1``, the i-th predict in batch k is on the j-th token.
            Only used during pretraining for partial prediction or for sequential decoding (generation).
        token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Segment token indices to indicate first and second portions of the inputs.
            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
            corresponds to a `sentence B` token. The classifier token should be represented by a ``2``.

            `What are token type IDs? <../glossary.html#token-type-ids>`_
        input_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Mask to avoid performing attention on padding token indices.
            Negative of `attention_mask`, i.e. with 0 for real tokens and 1 for padding.
            Kept for compatibility with the original code base.
            You can only uses one of `input_mask` and `attention_mask`
            Mask values selected in ``[0, 1]``:
            ``1`` for tokens that are MASKED, ``0`` for tokens that are NOT MASKED.
        head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
            Mask to nullify selected heads of the self-attention modules.
            Mask values selected in ``[0, 1]``:
            :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
        input_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
            than the model's internal embedding lookup matrix.
        use_cache (:obj:`bool`):
            If `use_cache` is True, `mems` are returned and can be used to speed up decoding (see `mems`). Defaults to `True`.
z_The bare XLNet Model transformer outputting raw hidden-states without any specific head on top.c                       sn   e Zd Z‡ fdd„Zdd„ Zdd„ Zdd„ Zd	d
„ Zdd„ Ze	ddd„ƒZ
ddd„Zeeƒddd„ƒZ‡  ZS )r¢   c                    sÀ   t ƒ  ˆ ¡ ˆ j| _ˆ j| _ˆ j| _ˆ j| _ˆ j| _ˆ j| _ˆ j| _ˆ j	| _	ˆ j
| _
ˆ j| _t ˆ jˆ j¡| _t t ddˆ j¡¡| _t ‡ fdd„tˆ jƒD ƒ¡| _t ˆ j¡| _|  ¡  d S )Nr   c                    s   g | ]}t ˆ ƒ‘qS r4   )rž   )Ú.0Ú_©r/   r4   r5   Ú
<listcomp>I  s     z'XLNetModel.__init__.<locals>.<listcomp>)rU   rV   rW   Úoutput_hidden_statesÚmem_lenÚ	reuse_lenrX   Úsame_lengthÚ	attn_typeÚbi_dataÚ	clamp_lenÚn_layerr   r¡   Ú
vocab_sizer   r[   rD   r\   r   Z
ModuleListÚranger   r_   r`   Úinit_weightsra   rc   rª   r5   rV   9  s      zXLNetModel.__init__c                 C   s   | j S re   ©r   ©rb   r4   r4   r5   Úget_input_embeddingsN  s    zXLNetModel.get_input_embeddingsc                 C   s
   || _ d S re   r·   )rb   Znew_embeddingsr4   r4   r5   Úset_input_embeddingsQ  s    zXLNetModel.set_input_embeddingsc                 C   s   t ‚d S re   rf   )rb   Zheads_to_pruner4   r4   r5   Ú_prune_headsT  s    zXLNetModel._prune_headsc                 C   sœ   t  ||g¡}t j|dd}t  ||g¡}t j||gdd}| jr†t j|dd}t j|dd…d|…f | |dd…|d…f gdd}| t|  	¡ ƒ¡}|S )aw  
        Creates causal attention mask. Float mask where 1.0 indicates masked, 0.0 indicates not-masked.

        Args:
            qlen: Sequence length
            mlen: Mask length

        ::

                  same_length=False:      same_length=True:
                  <mlen > <  qlen >       <mlen > <  qlen >
               ^ [0 0 0 0 0 1 1 1 1]     [0 0 0 0 0 1 1 1 1]
                 [0 0 0 0 0 0 1 1 1]     [1 0 0 0 0 0 1 1 1]
            qlen [0 0 0 0 0 0 0 1 1]     [1 1 0 0 0 0 0 1 1]
                 [0 0 0 0 0 0 0 0 1]     [1 1 1 0 0 0 0 0 1]
               v [0 0 0 0 0 0 0 0 0]     [1 1 1 1 0 0 0 0 0]

        r   )Zdiagonalrv   ri   N)
rD   ZonesZtriuÚzerosrˆ   r¯   ZtrilÚtoÚnextÚ
parameters)rb   ÚqlenÚmlenr   Zmask_upZattn_mask_padÚretZmask_lor4   r4   r5   Úcreate_maskW  s    6zXLNetModel.create_maskc                 C   sb   | j d k	r"| j dkr"|d | j … }|d kr<|| j d … }ntj||gdd| j d … }| ¡ S )Nr   rv   )r®   r­   rD   rˆ   Údetach)rb   Zcurr_outZprev_memZnew_memr4   r4   r5   Ú	cache_memu  s    zXLNetModel.cache_memNc                 C   s\   t  d| |¡}t jt  |¡t  |¡gdd}|d d …d d d …f }|d k	rX| d|d¡}|S )Nzi,d->idri   rv   )rD   rx   rˆ   ÚsinÚcosÚexpand)Zpos_seqÚinv_freqÚbszZsinusoid_inpÚpos_embr4   r4   r5   Úpositional_embedding  s    zXLNetModel.positional_embeddingc                 C   sv  t jd| jdt jd}dt  d|| j ¡ }| jdkrD||  }}n&| jdkrZ|d }}ntd	 | j¡ƒ‚| jr&t j||d
t jd}t j| | dt jd}	| j	dkrÌ| 
| j	 | j	¡}|	 
| j	 | j	¡}	|d k	rú|  |||d ¡}
|  |	||d ¡}n|  ||¡}
|  |	|¡}t j|
|gdd}n:t  ||d
¡}| j	dkrR| 
| j	 | j	¡}|  |||¡}| t|  ¡ ƒ¡}|S )Nr   g       @©rm   r   i'  ÚbiÚuniri   zUnknown `attn_type` {}.g      ð¿r    rS   rv   )rD   ro   rX   ÚfloatÚpowr°   rY   r   r±   r²   ÚclamprÌ   rˆ   r½   r¾   r¿   )rb   rÀ   rs   rÊ   Zfreq_seqrÉ   ÚbegÚendZfwd_pos_seqZbwd_pos_seqZfwd_pos_embZbwd_pos_embrË   r4   r4   r5   Úrelative_positional_encodingŒ  s2    


z'XLNetModel.relative_positional_encodingTc           $      C   sÆ  |dk	r|	dk	rt dƒ‚nh|dk	rJ| dd¡ ¡ }|jd |jd  }}n8|	dk	rz|	 dd¡ ¡ }	|	jd |	jd  }}nt dƒ‚|dk	rš| dd¡ ¡ nd}|dk	r¶| dd¡ ¡ nd}|dk	rÒ| dd¡ ¡ nd}|dk	rð| ddd¡ ¡ nd}|dk	r| ddd¡ ¡ nd}|dk	r:|d dk	r:|d jd nd}|| }t|  ¡ ƒj}t|  ¡ ƒj}| j	dkr”|  
||¡}|dd…dd…ddf }n"| j	dkr¦d}nt d	 | j	¡ƒ‚|dksÒ|dksÒtd
ƒ‚|dkrî|dk	rîd| }|dk	r|dk	r|d | }n<|dk	r.|dkr.|d }n|dkrH|dk	rH|}nd}|dk	rÖ|dkrŽt |jd ||g¡ |¡}tj||gdd}|dkr¶|dd…dd…dd…df }n ||dd…dd…dd…df 7 }|dk	rî|dk |¡}|dk	r^t |¡ |¡ }|dkr6tjt ||g¡ |¡|gdd}||dd…dd…ddf  dk |¡}nd}|	dk	rr|	}n
|  |¡}|  |¡}|dk	r²| j |jd |d¡}|  |¡}nd}|dk	r2|dkrôtj||gtj|d}tj||gdd}n|}|dd…df |ddd…f k ¡ }tj|dd |¡}nd}| j|||d}|  |¡}|dk	rÖ| ¡ dkrš| d¡ d¡ d¡ d¡}| | jdddd¡}n$| ¡ dkr¾| d¡ d¡ d¡}|jt|  ¡ ƒjd}ndg| j }d}|dkr dgt| jƒ }g }g }t| jƒD ]®\} }!| j dk	rR| j dkrR|
dkrR||  !|||  ¡f }| j"rv| #|dk	rp||fn|¡ |!||||||||  |||  d	}"|"dd… \}}| j$r| #|"d ¡ q| j"ræ| #|dk	rà||fn|¡ |  |dk	rø|n|¡}#|# ddd¡ ¡ f}"| j dk	r>| j dkr>|
dkr>|"|f }"| j"r€|dk	rdt%dd„ |D ƒƒ}nt%dd„ |D ƒƒ}|"|f }"| j$rÂ|dk	r¦t%dd„ |D ƒƒ}nt%dd„ |D ƒƒ}|"|f }"|"S )a«  
    Return:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.XLNetConfig`) and inputs:
        last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_predict, hidden_size)`):
            Sequence of hidden-states at the last layer of the model.
            `num_predict` corresponds to `target_mapping.shape[1]`. If `target_mapping` is `None`, then `num_predict` corresponds to `sequence_length`.
        mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
            Contains pre-computed hidden-states (key and values in the attention blocks).
            Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model
            should not be passed as input ids as they have already been computed.
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        from transformers import XLNetTokenizer, XLNetModel
        import torch

        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
        model = XLNetModel.from_pretrained('xlnet-large-cased')

        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=False)).unsqueeze(0)  # Batch size 1

        outputs = model(input_ids)
        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple

        NzDYou cannot specify both input_ids and inputs_embeds at the same timer   r   z5You have to specify either input_ids or inputs_embedsrS   rÏ   rÎ   zUnsupported attention type: {}z8You can only use one of input_mask (uses 1 for padding) r    rv   ri   ©rm   rl   )Znum_classes)rÊ   rÍ   r4   T)rŠ   r‹   r$   r~   rŒ   r   r€   c                 s   s*   | ]"}|D ]}|  d dd¡ ¡ V  q
qdS ©r   r   rS   N©ÚpermuteÚ
contiguous)r¨   Úhsr…   r4   r4   r5   Ú	<genexpr>‰  s       z%XLNetModel.forward.<locals>.<genexpr>c                 s   s    | ]}|  d dd¡ ¡ V  qdS r×   rØ   )r¨   rÛ   r4   r4   r5   rÜ   ‹  s     c                 s   s    | ]}t d d„ |D ƒƒV  qdS )c                 s   s"   | ]}|  d ddd¡ ¡ V  qdS ©rS   rj   r   r   NrØ   )r¨   Z
att_streamr4   r4   r5   rÜ   ‘  s     z/XLNetModel.forward.<locals>.<genexpr>.<genexpr>N)Útuple©r¨   Útr4   r4   r5   rÜ     s    c                 s   s"   | ]}|  d ddd¡ ¡ V  qdS rÝ   rØ   rß   r4   r4   r5   rÜ   ”  s     )&rY   r=   rÚ   rA   rÙ   r¾   r¿   rm   rl   r°   rÃ   r   rB   rD   r¼   r½   rˆ   Zeyer   r`   r   rÈ   rp   ry   Zone_hotrÕ   rw   Ú	unsqueezer³   r@   r   r   r­   rÅ   r¬   r)   rW   rÞ   )$rb   Ú	input_idsÚattention_maskrŒ   Ú	perm_maskr   Útoken_type_idsÚ
input_maskr€   Úinputs_embedsÚ	use_cacherÀ   rÊ   rÁ   rs   Zdtype_floatrl   r   Z	data_maskZ	mems_maskZnon_tgt_maskZ
word_emb_krŽ   Z
word_emb_qr   Zmem_padZcat_idsr~   rË   Znew_memsZ
attentionsÚhidden_statesr2   Zlayer_moduler   r†   r4   r4   r5   r‘   ³  sð    5
 *



 


"(





$

ÿ
"÷"



ÿ

zXLNetModel.forward)N)N)
NNNNNNNNNT)r’   r“   r”   rV   r¹   rº   r»   rÃ   rÅ   r•   rÌ   rÕ   r   ÚXLNET_INPUTS_DOCSTRINGr‘   r–   r4   r4   rc   r5   r¢   4  s*   

'          õr¢   zoXLNet Model with a language modeling head on top
    (linear layer with weights tied to the input embeddings). c                       s>   e Zd Z‡ fdd„Zdd„ Zdd„ Zeeƒdd	d
„ƒZ‡  Z	S )ÚXLNetLMHeadModelc                    sH   t ƒ  |¡ |j| _|j| _t|ƒ| _tj|j|j	dd| _
|  ¡  d S )NT)r   )rU   rV   r°   r¯   r¢   r   r   r˜   rX   r´   r   r¶   ra   rc   r4   r5   rV      s    
zXLNetLMHeadModel.__init__c                 C   s   | j S re   )r   r¸   r4   r4   r5   Úget_output_embeddingsª  s    z&XLNetLMHeadModel.get_output_embeddingsc           
      K   s²   |j d }tj|dftj|jd}tj||gdd}|j d }tj|||ftj|jd}d|d d …d d …df< tj|d|ftj|jd}d|d< ||||d d	œ}	|r®||	d
< |	S )Nr   r   rÖ   rv   r    ri   )r   r   ri   rè   )râ   rä   r   rè   rŒ   )rA   rD   r¼   rp   rl   rˆ   rÐ   )
rb   râ   ZpastÚkwargsZeffective_batch_sizeZdummy_tokenZsequence_lengthrä   r   Úinputsr4   r4   r5   Úprepare_inputs_for_generation­  s0    

  ÿ  ÿüz.XLNetLMHeadModel.prepare_inputs_for_generationNTc                 C   sx   | j |||||||||	|
d
}|  |d ¡}|f|dd…  }|dk	rttƒ }|| d| d¡¡| d¡ƒ}|f| }|S )aÈ  
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_predict)`, `optional`, defaults to :obj:`None`):
            Labels for masked language modeling.
            `num_predict` corresponds to `target_mapping.shape[1]`. If `target_mapping` is `None`, then `num_predict` corresponds to `sequence_length`.
            The labels should correspond to the masked input words that should be predicted and depends on `target_mapping`. Note in order to perform standard auto-regressive language modeling a `<mask>` token has to be added to the `input_ids` (see `prepare_inputs_for_generation` fn and examples below)
            Indices are selected in ``[-100, 0, ..., config.vocab_size]``
            All labels set to ``-100`` are ignored, the loss is only
            computed for labels in ``[0, ..., config.vocab_size]``

    Return:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.XLNetConfig`) and inputs:
        loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when ``labels`` is provided)
            Language modeling loss.
        prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_predict, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
            `num_predict` corresponds to `target_mapping.shape[1]`. If `target_mapping` is `None`, then `num_predict` corresponds to `sequence_length`.
        mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
            Contains pre-computed hidden-states (key and values in the attention blocks).
            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
            should not be passed as input ids as they have already been computed.
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        from transformers import XLNetTokenizer, XLNetLMHeadModel
        import torch

        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
        model = XLNetLMHeadModel.from_pretrained('xlnet-large-cased')

        # We show how to setup inputs to predict a next token using a bi-directional context.
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is very <mask>", add_special_tokens=False)).unsqueeze(0)  # We will predict the masked token
        perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float)
        perm_mask[:, :, -1] = 1.0  # Previous tokens don't see last token
        target_mapping = torch.zeros((1, 1, input_ids.shape[1]), dtype=torch.float)  # Shape [1, 1, seq_length] => let's predict one token
        target_mapping[0, 0, -1] = 1.0  # Our first (and only) prediction will be the last token of the sequence (the masked token)

        outputs = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping)
        next_token_logits = outputs[0]  # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]

        # The same way can the XLNetLMHeadModel be used to be trained by standard auto-regressive language modeling.
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is very <mask>", add_special_tokens=False)).unsqueeze(0)  # We will predict the masked token
        labels = torch.tensor(tokenizer.encode("cute", add_special_tokens=False)).unsqueeze(0)
        assert labels.shape[0] == 1, 'only one word will be predicted'
        perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float)
        perm_mask[:, :, -1] = 1.0  # Previous tokens don't see last token as is done in standard auto-regressive lm training
        target_mapping = torch.zeros((1, 1, input_ids.shape[1]), dtype=torch.float)  # Shape [1, 1, seq_length] => let's predict one token
        target_mapping[0, 0, -1] = 1.0  # Our first (and only) prediction will be the last token of the sequence (the masked token)

        outputs = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping, labels=labels)
        loss, next_token_logits = outputs[:2]  # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]

        ©	rã   rŒ   rä   r   rå   ræ   r€   rç   rè   r   r   Nri   )r   r   r   ÚviewÚsize)rb   râ   rã   rŒ   rä   r   rå   ræ   r€   rç   rè   ÚlabelsÚtransformer_outputsÚlogitsr   Úloss_fctÚlossr4   r4   r5   r‘   Î  s&    Nö
zXLNetLMHeadModel.forward)NNNNNNNNNTN)
r’   r“   r”   rV   rì   rï   r   rê   r‘   r–   r4   r4   rc   r5   rë   š  s    
!           ôrë   zˆXLNet Model with a sequence classification/regression head on top (a linear layer on top of
    the pooled output) e.g. for GLUE tasks. c                       s.   e Zd Z‡ fdd„Zeeƒddd„ƒZ‡  ZS )ÚXLNetForSequenceClassificationc                    sF   t ƒ  |¡ |j| _t|ƒ| _t|ƒ| _t |j	|j¡| _
|  ¡  d S re   )rU   rV   Ú
num_labelsr¢   r   r   r   r   r˜   rX   r   r¶   ra   rc   r4   r5   rV   <  s    

z'XLNetForSequenceClassification.__init__NTc                 C   sª   | j |||||||||	|
d
}|d }|  |¡}|  |¡}|f|dd…  }|dk	r¦| jdkr|tƒ }|| d¡| d¡ƒ}n tƒ }|| d| j¡| d¡ƒ}|f| }|S )a¹
  
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`)
            Labels for computing the sequence classification/regression loss.
            Indices should be in ``[0, ..., config.num_labels - 1]``.
            If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
            If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).

    Return:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.XLNetConfig`) and inputs:
        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (:obj:`torch.FloatTensor` of shape :obj:(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
            Contains pre-computed hidden-states (key and values in the attention blocks).
            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
            should not be passed as input ids as they have already been computed.
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        from transformers import XLNetTokenizer, XLNetForSequenceClassification
        import torch

        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
        model = XLNetForSequenceClassification.from_pretrained('xlnet-large-cased')

        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids, labels=labels)
        loss, logits = outputs[:2]

        rð   r   r   Nri   )r   r   r   rù   r   rñ   r   )rb   râ   rã   rŒ   rä   r   rå   ræ   r€   rç   rè   ró   rô   r†   rõ   r   rö   r÷   r4   r4   r5   r‘   F  s0    :ö



z&XLNetForSequenceClassification.forward)NNNNNNNNNTN©r’   r“   r”   rV   r   rê   r‘   r–   r4   r4   rc   r5   rø   6  s   
           ôrø   z›XLNet Model with a token classification head on top (a linear layer on top of
    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. c                       s.   e Zd Z‡ fdd„Zeeƒddd„ƒZ‡  ZS )ÚXLNetForTokenClassificationc                    s<   t ƒ  |¡ |j| _t|ƒ| _t |j|j¡| _|  	¡  d S re   )
rU   rV   rù   r¢   r   r   r˜   Úhidden_sizeÚ
classifierr¶   ra   rc   r4   r5   rV   ¦  s
    
z$XLNetForTokenClassification.__init__NTc                 C   sÊ   | j |||||||||	|
d
}|d }|  |¡}|f|dd…  }|dk	rÆtƒ }|dk	r¢| d¡dk}| d| j¡}t || d¡t |j¡ 	|¡¡}|||ƒ}n|| d| j¡| d¡ƒ}|f| }|S )a
  
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for computing the multiple choice classification loss.
            Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
            of the input tensors. (see `input_ids` above)

    Return:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.XLNetConfig`) and inputs:
        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
            Classification loss.
        logits (:obj:`torch.FloatTensor` of shape :obj:(batch_size, config.num_labels)`):
            Classification scores (before SoftMax).
        mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
            Contains pre-computed hidden-states (key and values in the attention blocks).
            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
            should not be passed as input ids as they have already been computed.
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        from transformers import XLNetTokenizer, XLNetForTokenClassification
        import torch

        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
        model = XLNetForTokenClassification.from_pretrained('xlnet-large-cased')

        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
        labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids, labels=labels)

        scores = outputs[0]

        rð   r   r   Nri   )
r   rý   r   rñ   rù   rD   ÚwhereZtensorÚignore_indexZtype_as)rb   râ   rã   rŒ   rä   r   rå   ræ   r€   rç   rè   ró   r   Úsequence_outputrõ   rö   Zactive_lossZactive_logitsZactive_labelsr÷   r4   r4   r5   r‘   ¯  s:    ;ö
  ÿ
z#XLNetForTokenClassification.forward)NNNNNNNNNTNrú   r4   r4   rc   r5   rû      s   	           ôrû   z—XLNet Model with a multiple choice classification head on top (a linear layer on top of
    the pooled output and a softmax) e.g. for RACE/SWAG tasks. c                       s.   e Zd Z‡ fdd„Zeeƒddd„ƒZ‡  ZS )ÚXLNetForMultipleChoicec                    s<   t ƒ  |¡ t|ƒ| _t|ƒ| _t |jd¡| _	|  
¡  d S )Nr   )rU   rV   r¢   r   r   r   r   r˜   rX   r   r¶   ra   rc   r4   r5   rV     s
    

zXLNetForMultipleChoice.__init__NTc                 C   sú   |j d }| d| d¡¡}|dk	r6| d| d¡¡nd}|dk	rT| d| d¡¡nd}|dk	rr| d| d¡¡nd}| j|||||||||	|
d
}|d }|  |¡}|  |¡}| d|¡}|f|dd…  }|dk	rötƒ }||| d¡ƒ}|f| }|S )a²
  
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for computing the multiple choice classification loss.
            Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
            of the input tensors. (see `input_ids` above)

    Returns:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.XLNetConfig`) and inputs:
        loss (:obj:`torch.FloatTensor`` of shape ``(1,)`, `optional`, returned when :obj:`labels` is provided):
            Classification loss.
        classification_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`):
            `num_choices` is the second dimension of the input tensors. (see `input_ids` above).

            Classification scores (before SoftMax).
        mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
            Contains pre-computed hidden-states (key and values in the attention blocks).
            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
            should not be passed as input ids as they have already been computed.
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        from transformers import XLNetTokenizer, XLNetForMultipleChoice
        import torch

        tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
        model = XLNetForMultipleChoice.from_pretrained('xlnet-base-cased')

        choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
        input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
        labels = torch.tensor(1).unsqueeze(0)  # Batch size 1

        outputs = model(input_ids, labels=labels)
        loss, classification_scores = outputs[:2]

        r   ri   N)	rå   ræ   rã   rŒ   rä   r   r€   rç   rè   r   )rA   rñ   rò   r   r   r   r   )rb   râ   rå   ræ   rã   rŒ   rä   r   r€   rç   rè   ró   Znum_choicesZflat_input_idsZflat_token_type_idsZflat_attention_maskZflat_input_maskrô   r†   rõ   Zreshaped_logitsr   rö   r÷   r4   r4   r5   r‘     s:    =
ö

ÿ

zXLNetForMultipleChoice.forward)NNNNNNNNNTNrú   r4   r4   rc   r5   r    s   	           ôr  zÕXLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
    the hidden-states output to compute `span start logits` and `span end logits`). c                       s.   e Zd Z‡ fdd„Zeeƒddd„ƒZ‡  ZS )ÚXLNetForQuestionAnsweringSimplec                    s<   t ƒ  |¡ |j| _t|ƒ| _t |j|j¡| _|  	¡  d S re   )
rU   rV   rù   r¢   r   r   r˜   rü   Ú
qa_outputsr¶   ra   rc   r4   r5   rV   „  s
    
z(XLNetForQuestionAnsweringSimple.__init__NTc                 C   s  | j |||||||||	|
d
}|d }|  |¡}|jddd\}}| d¡}| d¡}||f|dd…  }|dk	r|dk	rt| ¡ ƒdkr˜| d¡}t| ¡ ƒdkr²| d¡}| d¡}| d|¡ | d|¡ t|d}|||ƒ}|||ƒ}|| d }|f| }|S )	a  
        start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`).
            Position outside of the sequence are not taken into account for computing the loss.
        end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`).
            Position outside of the sequence are not taken into account for computing the loss.

    Returns:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.XLNetConfig`) and inputs:
        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
            Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
        start_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`):
            Span-start scores (before SoftMax).
        end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`):
            Span-end scores (before SoftMax).
        mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
            Contains pre-computed hidden-states (key and values in the attention blocks).
            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
            should not be passed as input ids as they have already been computed.
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        from transformers import XLNetTokenizer, XLNetForQuestionAnsweringSimple
        import torch

        tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
        model = XLNetForQuestionAnsweringSimple.from_pretrained('xlnet-base-cased')

        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        start_positions = torch.tensor([1])
        end_positions = torch.tensor([3])

        outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
        loss = outputs[0]

        rð   r   r   ri   rv   rS   N)rÿ   )r   r  ÚsplitZsqueezer@   rò   Zclamp_r   )rb   râ   rã   rŒ   rä   r   rå   ræ   r€   rç   rè   Ústart_positionsÚend_positionsr   r   rõ   Ústart_logitsÚ
end_logitsZignored_indexrö   Ú
start_lossÚend_lossÚ
total_lossr4   r4   r5   r‘     s@    Cö









z'XLNetForQuestionAnsweringSimple.forward)NNNNNNNNNTNNrú   r4   r4   rc   r5   r  ~  s   	            ór  c                       s.   e Zd Z‡ fdd„Zeeƒddd„ƒZ‡  ZS )ÚXLNetForQuestionAnsweringc                    sP   t ƒ  |¡ |j| _|j| _t|ƒ| _t|ƒ| _t|ƒ| _	t
|ƒ| _|  ¡  d S re   )rU   rV   Ústart_n_topÚ	end_n_topr¢   r   r   r  r   r  r   Úanswer_classr¶   ra   rc   r4   r5   rV   ÿ  s    



z"XLNetForQuestionAnswering.__init__NTc           )      C   s(  | j |||||||||	|
d
}|d }| j||d}|dd… }|dk	r|dk	r||||fD ]"}|dk	r`| ¡ dkr`| d¡ q`| j|||d}tƒ }|||ƒ}|||ƒ}|| d }|dk	rø|dk	rø| j|||d	}t ¡ }|||ƒ}||d
 7 }|f| }n| 	¡ \}}}t
j|dd} tj| | jdd\}!}"|" d¡ dd|¡}#t |d|#¡}$|$ d¡ d|dd¡}$| d¡ |$¡}%|dk	r”| d¡nd}| j|%|$|d}t
j|dd}&tj|&| jdd\}'}(|' d| j| j ¡}'|( d| j| j ¡}(t d|| ¡}$| j||$|d}|!|"|'|(|f| }|S )aV  
        start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`).
            Position outside of the sequence are not taken into account for computing the loss.
        end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`).
            Position outside of the sequence are not taken into account for computing the loss.
        is_impossible (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`):
            Labels whether a question has an answer or no answer (SQuAD 2.0)
        cls_index (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`):
            Labels for position (index) of the classification token to use as input for computing plausibility of the answer.
        p_mask (``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
            Optional mask of tokens which can't be in answers (e.g. [CLS], [PAD], ...).
            1.0 means token should be masked. 0.0 mean token is not masked.

    Returns:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.XLNetConfig`) and inputs:
        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned if both :obj:`start_positions` and :obj:`end_positions` are provided):
            Classification loss as the sum of start token, end token (and is_impossible if provided) classification losses.
        start_top_log_probs (``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
            Log probabilities for the top config.start_n_top start token possibilities (beam-search).
        start_top_index (``torch.LongTensor`` of shape ``(batch_size, config.start_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
            Indices for the top config.start_n_top start token possibilities (beam-search).
        end_top_log_probs (``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
            Log probabilities for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search).
        end_top_index (``torch.LongTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
            Indices for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search).
        cls_logits (``torch.FloatTensor`` of shape ``(batch_size,)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
            Log probabilities for the ``is_impossible`` label of the answers.
        mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
            Contains pre-computed hidden-states (key and values in the attention blocks).
            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
            should not be passed as input ids as they have already been computed.
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        from transformers import XLNetTokenizer, XLNetForQuestionAnswering
        import torch

        tokenizer =  XLNetTokenizer.from_pretrained('xlnet-base-cased')
        model = XLNetForQuestionAnswering.from_pretrained('xlnet-base-cased')

        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        start_positions = torch.tensor([1])
        end_positions = torch.tensor([3])
        outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
        loss = outputs[0]

        rð   r   )Úp_maskr   Nri   )r  r  rS   )r  Ú	cls_indexrR   rv   éþÿÿÿ)Ústart_statesr  z
blh,bl->bh)r  r  )r   r  rw   Zsqueeze_r  r   r  r   ZBCEWithLogitsLossrò   ry   rz   rD   Ztopkr  rá   rÈ   ZgatherZ	expand_asr  rñ   rx   ))rb   râ   rã   rŒ   rä   r   rå   ræ   r€   rç   rè   r  r  Zis_impossibler  r  rô   ré   r  r   rr   r  rö   r	  r
  r  Z
cls_logitsZloss_fct_clsZcls_lossrÊ   ÚslenZhszZstart_log_probsZstart_top_log_probsZstart_top_indexZstart_top_index_expr  Zhidden_states_expandedZend_log_probsZend_top_log_probsZend_top_indexr4   r4   r5   r‘     s‚    Qö


  ÿ

ÿ  ÿ
  ÿ  ÿz!XLNetForQuestionAnswering.forward)NNNNNNNNNTNNNNNrú   r4   r4   rc   r5   r  ù  s$                  ðr  )N)/r¦   ÚloggingrD   r   Ztorch.nnr   r   r   ry   Zactivationsr   r   Zconfiguration_xlnetr	   Z
file_utilsr
   r   Zmodeling_utilsr   r   r   r   r   Ú	getLoggerr’   r9   r§   r6   rO   rP   rš   Z	LayerNormr]   ÚModulerQ   r—   rž   rŸ   ZXLNET_START_DOCSTRINGrê   r¢   rë   rø   rû   r  r  r  r4   r4   r4   r5   Ú<module>   s|   
þ
S> S'9þ  dý ýeýhýlývý