""" PyTorch XLM model.
"""


import itertools
import logging
import math

import numpy as np
import torch
from torch import nn
from torch.nn import CrossEntropyLoss, MSELoss
from torch.nn import functional as F

from .activations import gelu
from .configuration_xlm import XLMConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .modeling_utils import PreTrainedModel, SequenceSummary, SQuADHead, prune_linear_layer


logger = logging.getLogger(__name__)

XLM_PRETRAINED_MODEL_ARCHIVE_MAP = {
    "xlm-mlm-en-2048": "https://cdn.huggingface.co/xlm-mlm-en-2048-pytorch_model.bin",
    "xlm-mlm-ende-1024": "https://cdn.huggingface.co/xlm-mlm-ende-1024-pytorch_model.bin",
    "xlm-mlm-enfr-1024": "https://cdn.huggingface.co/xlm-mlm-enfr-1024-pytorch_model.bin",
    "xlm-mlm-enro-1024": "https://cdn.huggingface.co/xlm-mlm-enro-1024-pytorch_model.bin",
    "xlm-mlm-tlm-xnli15-1024": "https://cdn.huggingface.co/xlm-mlm-tlm-xnli15-1024-pytorch_model.bin",
    "xlm-mlm-xnli15-1024": "https://cdn.huggingface.co/xlm-mlm-xnli15-1024-pytorch_model.bin",
    "xlm-clm-enfr-1024": "https://cdn.huggingface.co/xlm-clm-enfr-1024-pytorch_model.bin",
    "xlm-clm-ende-1024": "https://cdn.huggingface.co/xlm-clm-ende-1024-pytorch_model.bin",
    "xlm-mlm-17-1280": "https://cdn.huggingface.co/xlm-mlm-17-1280-pytorch_model.bin",
    "xlm-mlm-100-1280": "https://cdn.huggingface.co/xlm-mlm-100-1280-pytorch_model.bin",
}


def create_sinusoidal_embeddings(n_pos, dim, out):
    position_enc = np.array([[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)])
    out[:, 0::2] = torch.FloatTensor(np.sin(position_enc[:, 0::2]))
    out[:, 1::2] = torch.FloatTensor(np.cos(position_enc[:, 1::2]))
    out.detach_()
    out.requires_grad = False


def get_masks(slen, lengths, causal, padding_mask=None):
    """
    Generate hidden states mask, and optionally an attention mask.
    """
    alen = torch.arange(slen, dtype=torch.long, device=lengths.device)
    if padding_mask is not None:
        mask = padding_mask
    else:
        assert lengths.max().item() <= slen
        mask = alen < lengths[:, None]

    # attention mask is the same as mask, or triangular inferior attention (causal)
    bs = lengths.size(0)
    if causal:
        attn_mask = alen[None, None, :].repeat(bs, slen, 1) <= alen[None, :, None]
    else:
        attn_mask = mask

    # sanity check
    assert mask.size() == (bs, slen)
    assert causal is False or attn_mask.size() == (bs, slen, slen)

    return mask, attn_mask


class MultiHeadAttention(nn.Module):

    NEW_ID = itertools.count()

    def __init__(self, n_heads, dim, config):
        super().__init__()
        self.layer_id = next(MultiHeadAttention.NEW_ID)
        self.output_attentions = config.output_attentions
        self.dim = dim
        self.n_heads = n_heads
        self.dropout = config.attention_dropout
        assert self.dim % self.n_heads == 0
        self.q_lin = nn.Linear(dim, dim)
        self.k_lin = nn.Linear(dim, dim)
        self.v_lin = nn.Linear(dim, dim)
        self.out_lin = nn.Linear(dim, dim)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        attention_head_size = self.dim // self.n_heads
        if len(heads) == 0:
            return
        mask = torch.ones(self.n_heads, attention_head_size)
        heads = set(heads) - self.pruned_heads
        for head in heads:
            head -= sum(1 if h < head else 0 for h in self.pruned_heads)
            mask[head] = 0
        mask = mask.view(-1).contiguous().eq(1)
        index = torch.arange(len(mask))[mask].long()
        # Prune linear layers
        self.q_lin = prune_linear_layer(self.q_lin, index)
        self.k_lin = prune_linear_layer(self.k_lin, index)
        self.v_lin = prune_linear_layer(self.v_lin, index)
        self.out_lin = prune_linear_layer(self.out_lin, index, dim=1)
        # Update hyper params
        self.n_heads = self.n_heads - len(heads)
        self.dim = attention_head_size * self.n_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(self, input, mask, kv=None, cache=None, head_mask=None):
        """
        Self-attention (if kv is None) or attention over source sentence (provided by kv).
        """
        # Input is (bs, qlen, dim); mask is (bs, klen) (non-causal) or (bs, klen, klen)
        bs, qlen, dim = input.size()
        if kv is None:
            klen = qlen if cache is None else cache["slen"] + qlen
        else:
            klen = kv.size(1)
        n_heads = self.n_heads
        dim_per_head = self.dim // n_heads
        mask_reshape = (bs, 1, qlen, klen) if mask.dim() == 3 else (bs, 1, 1, klen)

        def shape(x):
            """  projection """
            return x.view(bs, -1, self.n_heads, dim_per_head).transpose(1, 2)

        def unshape(x):
            """  compute context """
            return x.transpose(1, 2).contiguous().view(bs, -1, self.n_heads * dim_per_head)

        q = shape(self.q_lin(input))  # (bs, n_heads, qlen, dim_per_head)
        if kv is None:
            k = shape(self.k_lin(input))
            v = shape(self.v_lin(input))
        elif cache is None or self.layer_id not in cache:
            k = v = kv
            k = shape(self.k_lin(k))
            v = shape(self.v_lin(v))

        if cache is not None:
            if self.layer_id in cache:
                if kv is None:
                    k_, v_ = cache[self.layer_id]
                    k = torch.cat([k_, k], dim=2)  # (bs, n_heads, klen, dim_per_head)
                    v = torch.cat([v_, v], dim=2)
                else:
                    k, v = cache[self.layer_id]
            cache[self.layer_id] = (k, v)

        q = q / math.sqrt(dim_per_head)
        scores = torch.matmul(q, k.transpose(2, 3))  # (bs, n_heads, qlen, klen)
        mask = (mask == 0).view(mask_reshape).expand_as(scores)
        scores.masked_fill_(mask, -float("inf"))

        weights = F.softmax(scores.float(), dim=-1).type_as(scores)
        weights = F.dropout(weights, p=self.dropout, training=self.training)

        # Mask heads if we want to
        if head_mask is not None:
            weights = weights * head_mask

        context = torch.matmul(weights, v)  # (bs, n_heads, qlen, dim_per_head)
        context = unshape(context)  # (bs, qlen, dim)

        outputs = (self.out_lin(context),)
        if self.output_attentions:
            outputs = outputs + (weights,)
        return outputs


class TransformerFFN(nn.Module):
    def __init__(self, in_dim, dim_hidden, out_dim, config):
        super().__init__()
        self.dropout = config.dropout
        self.lin1 = nn.Linear(in_dim, dim_hidden)
        self.lin2 = nn.Linear(dim_hidden, out_dim)
        self.act = gelu if config.gelu_activation else F.relu

    def forward(self, input):
        x = self.lin1(input)
        x = self.act(x)
        x = self.lin2(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        return x


class XLMPreTrainedModel(PreTrainedModel):
    """ An abstract class to handle weights initialization and
        a simple interface for downloading and loading pretrained models.
    """

    config_class = XLMConfig
    pretrained_model_archive_map = XLM_PRETRAINED_MODEL_ARCHIVE_MAP
    load_tf_weights = None
    base_model_prefix = "transformer"

    def __init__(self, *inputs, **kwargs):
        super().__init__(*inputs, **kwargs)

    @property
    def dummy_inputs(self):
        inputs_list = torch.tensor([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]])
        attns_list = torch.tensor([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
        if self.config.use_lang_emb and self.config.n_langs > 1:
            langs_list = torch.tensor([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
        else:
            langs_list = None
        return {"input_ids": inputs_list, "attention_mask": attns_list, "langs": langs_list}

    def _init_weights(self, module):
        """ Initialize the weights. """
        if isinstance(module, nn.Embedding):
            if self.config is not None and self.config.embed_init_std is not None:
                nn.init.normal_(module.weight, mean=0, std=self.config.embed_init_std)
        if isinstance(module, nn.Linear):
            if self.config is not None and self.config.init_std is not None:
                nn.init.normal_(module.weight, mean=0, std=self.config.init_std)
                if hasattr(module, "bias") and module.bias is not None:
                    nn.init.constant_(module.bias, 0.0)
        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


XLM_START_DOCSTRING = r"""

    This model is a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`_ sub-class.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general
    usage and behavior.

    Parameters:
        config (:class:`~transformers.XLMConfig`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the configuration.
            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
"""

XLM_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using :class:`transformers.BertTokenizer`.
            See :func:`transformers.PreTrainedTokenizer.encode` and
            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.

            `What are input IDs? <../glossary.html#input-ids>`__
        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.

            `What are attention masks? <../glossary.html#attention-mask>`__
        langs (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            A parallel sequence of tokens to be used to indicate the language of each token in the input.
            Indices are languages ids which can be obtained from the language names by using two conversion mappings
            provided in the configuration of the model (only provided for multilingual models).
            More precisely, the `language name -> language id` mapping is in `model.config.lang2id` (dict str -> int) and
            the `language id -> language name` mapping is `model.config.id2lang` (dict int -> str).

            See usage examples detailed in the `multilingual documentation <https://huggingface.co/transformers/multilingual.html>`__.
        token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Segment token indices to indicate first and second portions of the inputs.
            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
            corresponds to a `sentence B` token

            `What are token type IDs? <../glossary.html#token-type-ids>`_
        position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Selected in the range ``[0, config.max_position_embeddings - 1]``.

            `What are position IDs? <../glossary.html#position-ids>`_
        lengths (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Length of each sentence that can be used to avoid performing attention on padding token indices.
            You can also use `attention_mask` for the same result (see above), kept here for compatibility.
            Indices selected in ``[0, ..., input_ids.size(-1)]``:
        cache (:obj:`Dict[str, torch.FloatTensor]`, `optional`, defaults to :obj:`None`):
            dictionary with ``torch.FloatTensor`` that contains pre-computed
            hidden-states (key and values in the attention blocks) as computed by the model
            (see `cache` output below). Can be used to speed up sequential decoding.
            The dictionary object will be modified in-place during the forward pass to add newly computed hidden-states.
        head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
            Mask to nullify selected heads of the self-attention modules.
            Mask values selected in ``[0, 1]``:
            :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
        input_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
            than the model's internal embedding lookup matrix.
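
    Example (an illustrative sketch of the ``langs`` argument; it assumes a multilingual checkpoint
    whose ``config.lang2id`` contains an ``"en"`` entry)::

        language_id = model.config.lang2id["en"]
        langs = torch.full_like(input_ids, language_id)  # one language id per input token
        outputs = model(input_ids, langs=langs)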
"""


@add_start_docstrings(
    "The bare XLM Model transformer outputting raw hidden-states without any specific head on top.",
    XLM_START_DOCSTRING,
)
class XLMModel(XLMPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states

        # encoder / decoder, output layer
        self.is_encoder = config.is_encoder
        self.is_decoder = not config.is_encoder
        if self.is_decoder:
            raise NotImplementedError("Currently XLM can only be used as an encoder")
        self.causal = config.causal

        # dictionary / languages
        self.n_langs = config.n_langs
        self.use_lang_emb = config.use_lang_emb
        self.n_words = config.n_words
        self.eos_index = config.eos_index
        self.pad_index = config.pad_index

        # model parameters
        self.dim = config.emb_dim
        self.hidden_dim = self.dim * 4
        self.n_heads = config.n_heads
        self.n_layers = config.n_layers
        self.dropout = config.dropout
        self.attention_dropout = config.attention_dropout
        assert self.dim % self.n_heads == 0, "transformer dim must be a multiple of n_heads"

        # embeddings
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, self.dim)
        if config.sinusoidal_embeddings:
            create_sinusoidal_embeddings(config.max_position_embeddings, self.dim, out=self.position_embeddings.weight)
        if config.n_langs > 1 and config.use_lang_emb:
            self.lang_embeddings = nn.Embedding(self.n_langs, self.dim)
        self.embeddings = nn.Embedding(self.n_words, self.dim, padding_idx=self.pad_index)
        self.layer_norm_emb = nn.LayerNorm(self.dim, eps=config.layer_norm_eps)

        # transformer layers
        self.attentions = nn.ModuleList()
        self.layer_norm1 = nn.ModuleList()
        self.ffns = nn.ModuleList()
        self.layer_norm2 = nn.ModuleList()

        for _ in range(self.n_layers):
            self.attentions.append(MultiHeadAttention(self.n_heads, self.dim, config=config))
            self.layer_norm1.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps))
            self.ffns.append(TransformerFFN(self.dim, self.hidden_dim, self.dim, config=config))
            self.layer_norm2.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps))

        if hasattr(config, "pruned_heads"):
            pruned_heads = config.pruned_heads.copy().items()
            config.pruned_heads = {}
            for layer, heads in pruned_heads:
                if self.attentions[int(layer)].n_heads == config.n_heads:
                    self.prune_heads({int(layer): list(map(int, heads))})

        self.init_weights()

    def get_input_embeddings(self):
        return self.embeddings

    def set_input_embeddings(self, new_embeddings):
        self.embeddings = new_embeddings

    def _prune_heads(self, heads_to_prune):
        """ Prunes heads of the model.
            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
            See base class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.attentions[layer].prune_heads(heads)

    @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
    def forward(self, input_ids=None, attention_mask=None, langs=None, token_type_ids=None, position_ids=None, lengths=None, cache=None, head_mask=None, inputs_embeds=None):
        r"""
    Return:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.XLMConfig`) and inputs:
        last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        from transformers import XLMTokenizer, XLMModel
        import torch

        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
        model = XLMModel.from_pretrained('xlm-mlm-en-2048')
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids)
        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple

        """
        if input_ids is not None:
            bs, slen = input_ids.size()
        else:
            bs, slen = inputs_embeds.size()[:-1]

        if lengths is None:
            if input_ids is not None:
                lengths = (input_ids != self.pad_index).sum(dim=1).long()
            else:
                lengths = torch.LongTensor([slen] * bs)

        # check inputs
        assert lengths.size(0) == bs
        assert lengths.max().item() <= slen

        # generate masks
        mask, attn_mask = get_masks(slen, lengths, self.causal, padding_mask=attention_mask)

        device = input_ids.device if input_ids is not None else inputs_embeds.device

        # position_ids
        if position_ids is None:
            position_ids = torch.arange(slen, dtype=torch.long, device=device)
            position_ids = position_ids.unsqueeze(0).expand((bs, slen))
        else:
            assert position_ids.size() == (bs, slen)

        # langs
        if langs is not None:
            assert langs.size() == (bs, slen)

        # Prepare head mask if needed
        head_mask = self.get_head_mask(head_mask, self.config.n_layers)

        # do not recompute cached elements
        if cache is not None and input_ids is not None:
            _slen = slen - cache["slen"]
            input_ids = input_ids[:, -_slen:]
            position_ids = position_ids[:, -_slen:]
            if langs is not None:
                langs = langs[:, -_slen:]
            mask = mask[:, -_slen:]
            attn_mask = attn_mask[:, -_slen:]

        # embeddings
        if inputs_embeds is None:
            inputs_embeds = self.embeddings(input_ids)

        tensor = inputs_embeds + self.position_embeddings(position_ids).expand_as(inputs_embeds)
        if langs is not None and self.use_lang_emb and self.n_langs > 1:
            tensor = tensor + self.lang_embeddings(langs)
        if token_type_ids is not None:
            tensor = tensor + self.embeddings(token_type_ids)
        tensor = self.layer_norm_emb(tensor)
        tensor = F.dropout(tensor, p=self.dropout, training=self.training)
        tensor *= mask.unsqueeze(-1).to(tensor.dtype)

        # transformer layers
        hidden_states = ()
        attentions = ()
        for i in range(self.n_layers):
            if self.output_hidden_states:
                hidden_states = hidden_states + (tensor,)

            # self attention
            attn_outputs = self.attentions[i](tensor, attn_mask, cache=cache, head_mask=head_mask[i])
            attn = attn_outputs[0]
            if self.output_attentions:
                attentions = attentions + (attn_outputs[1],)
            attn = F.dropout(attn, p=self.dropout, training=self.training)
            tensor = tensor + attn
            tensor = self.layer_norm1[i](tensor)

            # FFN
            tensor = tensor + self.ffns[i](tensor)
            tensor = self.layer_norm2[i](tensor)
            tensor *= mask.unsqueeze(-1).to(tensor.dtype)

        # Add last hidden state
        if self.output_hidden_states:
            hidden_states = hidden_states + (tensor,)

        # update cache length
        if cache is not None:
            cache["slen"] += tensor.size(1)

        outputs = (tensor,)
        if self.output_hidden_states:
            outputs = outputs + (hidden_states,)
        if self.output_attentions:
            outputs = outputs + (attentions,)
        return outputs  # outputs, (hidden_states), (attentions)


class XLMPredLayer(nn.Module):
    """
    Prediction layer (cross_entropy or adaptive_softmax).
    """

    def __init__(self, config):
        super().__init__()
        self.asm = config.asm
        self.n_words = config.n_words
        self.pad_index = config.pad_index
        dim = config.emb_dim

        if config.asm is False:
            self.proj = nn.Linear(dim, config.n_words, bias=True)
        else:
            self.proj = nn.AdaptiveLogSoftmaxWithLoss(
                in_features=dim,
                n_classes=config.n_words,
                cutoffs=config.asm_cutoffs,
                div_value=config.asm_div_value,
                head_bias=True,
            )

    def forward(self, x, y=None):
        """ Compute the loss, and optionally the scores. """
        outputs = ()
        if self.asm is False:
            scores = self.proj(x)
            outputs = (scores,) + outputs
            if y is not None:
                loss = F.cross_entropy(scores.view(-1, self.n_words), y.view(-1), reduction="elementwise_mean")
                outputs = (loss,) + outputs
        else:
            scores = self.proj.log_prob(x)
            outputs = (scores,) + outputs
            if y is not None:
                _, loss = self.proj(x, y)
                outputs = (loss,) + outputs

        return outputs


@add_start_docstrings(
    """The XLM Model transformer with a language modeling head on top
    (linear layer with weights tied to the input embeddings). """,
    XLM_START_DOCSTRING,
)
class XLMWithLMHeadModel(XLMPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.transformer = XLMModel(config)
        self.pred_layer = XLMPredLayer(config)
        self.init_weights()

    def get_output_embeddings(self):
        return self.pred_layer.proj

    def prepare_inputs_for_generation(self, input_ids, **kwargs):
        mask_token_id = self.config.mask_token_id
        lang_id = self.config.lang_id

        effective_batch_size = input_ids.shape[0]
        mask_token = torch.full((effective_batch_size, 1), mask_token_id, dtype=torch.long, device=input_ids.device)
        input_ids = torch.cat([input_ids, mask_token], dim=1)
        if lang_id is not None:
            langs = torch.full_like(input_ids, lang_id)
        else:
            langs = None
        return {"input_ids": input_ids, "langs": langs}

    @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
    def forward(self, input_ids=None, attention_mask=None, langs=None, token_type_ids=None, position_ids=None, lengths=None, cache=None, head_mask=None, inputs_embeds=None, labels=None):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Labels for language modeling.
            Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
            Indices are selected in ``[-100, 0, ..., config.vocab_size]``
            All labels set to ``-100`` are ignored (masked), the loss is only
            computed for labels in ``[0, ..., config.vocab_size]``

    Return:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.XLMConfig`) and inputs:
        loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when ``labels`` is provided)
            Language modeling loss.
        prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        from transformers import XLMTokenizer, XLMWithLMHeadModel
        import torch

        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
        model = XLMWithLMHeadModel.from_pretrained('xlm-mlm-en-2048')
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids)
        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple

        """
        transformer_outputs = self.transformer(
            input_ids,
            attention_mask=attention_mask,
            langs=langs,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            lengths=lengths,
            cache=cache,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )

        output = transformer_outputs[0]
        outputs = self.pred_layer(output, labels)
        outputs = outputs + transformer_outputs[1:]  # Keep attentions/hidden states if they are here

        return outputs


@add_start_docstrings(
    """XLM Model with a sequence classification/regression head on top (a linear layer on top of
    the pooled output) e.g. for GLUE tasks. """,
    XLM_START_DOCSTRING,
)
class XLMForSequenceClassification(XLMPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.transformer = XLMModel(config)
        self.sequence_summary = SequenceSummary(config)

        self.init_weights()

    @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
    def forward(self, input_ids=None, attention_mask=None, langs=None, token_type_ids=None, position_ids=None, lengths=None, cache=None, head_mask=None, inputs_embeds=None, labels=None):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for computing the sequence classification/regression loss.
            Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
            If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).

    Returns:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.XLMConfig`) and inputs:
        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        from transformers import XLMTokenizer, XLMForSequenceClassification
        import torch

        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
        model = XLMForSequenceClassification.from_pretrained('xlm-mlm-en-2048')
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids, labels=labels)
        loss, logits = outputs[:2]

        r   r   r   NrM   )r   r   r   r   rP   r   )rE   r   r   r   r   r   r,   rl   rm   r   r   r   r   logitsru   loss_fctr   r   r   r   rv     s,    4


z$XLMForSequenceClassification.forward)
NNNNNNNNNNrw   rx   ry   r6   r
   r   rv   r|   r   r   rG   r   r     s   	          r   zXLM Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
    the hidden-states output to compute `span start logits` and `span end logits`). """,
    XLM_START_DOCSTRING,
)
class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.transformer = XLMModel(config)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        self.init_weights()

    @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
    def forward(self, input_ids=None, attention_mask=None, langs=None, token_type_ids=None, position_ids=None, lengths=None, cache=None, head_mask=None, inputs_embeds=None, start_positions=None, end_positions=None):
        r"""
        start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`).
            Positions outside of the sequence are not taken into account for computing the loss.
        end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`).
            Positions outside of the sequence are not taken into account for computing the loss.

    Returns:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.XLMConfig`) and inputs:
        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
            Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
        start_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`):
            Span-start scores (before SoftMax).
        end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`):
            Span-end scores (before SoftMax).
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        from transformers import XLMTokenizer, XLMForQuestionAnsweringSimple
        import torch

        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
        model = XLMForQuestionAnsweringSimple.from_pretrained('xlm-mlm-en-2048')
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        start_positions = torch.tensor([1])
        end_positions = torch.tensor([3])
        outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
        loss = outputs[0]

        """
        transformer_outputs = self.transformer(
            input_ids,
            attention_mask=attention_mask,
            langs=langs,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            lengths=lengths,
            cache=cache,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )

        sequence_output = transformer_outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        outputs = (start_logits, end_logits)
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split adds a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions.clamp_(0, ignored_index)
            end_positions.clamp_(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2
            outputs = (total_loss,) + outputs

        outputs = outputs + transformer_outputs[1:]  # Keep hidden states and attentions if they are here

        return outputs


@add_start_docstrings(
    """XLM Model with a beam-search span classification head on top for extractive question-answering tasks like SQuAD (linear layers on top of
    the hidden-states output to compute `span start logits` and `span end logits`). """,
    XLM_START_DOCSTRING,
)
class XLMForQuestionAnswering(XLMPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.transformer = XLMModel(config)
        self.qa_outputs = SQuADHead(config)

        self.init_weights()

    @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
    def forward(self, input_ids=None, attention_mask=None, langs=None, token_type_ids=None, position_ids=None, lengths=None, cache=None, head_mask=None, inputs_embeds=None, start_positions=None, end_positions=None, is_impossible=None, cls_index=None, p_mask=None):
        r"""
        start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`).
            Positions outside of the sequence are not taken into account for computing the loss.
        end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`).
            Positions outside of the sequence are not taken into account for computing the loss.
        is_impossible (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`):
            Labels whether a question has an answer or no answer (SQuAD 2.0)
        cls_index (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`):
            Labels for position (index) of the classification token to use as input for computing plausibility of the answer.
        p_mask (``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
            Optional mask of tokens which can't be in answers (e.g. [CLS], [PAD], ...).
            1.0 means token should be masked. 0.0 mean token is not masked.

    Returns:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.XLMConfig`) and inputs:
        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned if both :obj:`start_positions` and :obj:`end_positions` are provided):
            Classification loss as the sum of start token, end token (and is_impossible if provided) classification losses.
        start_top_log_probs (``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
            Log probabilities for the top config.start_n_top start token possibilities (beam-search).
        start_top_index (``torch.LongTensor`` of shape ``(batch_size, config.start_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
            Indices for the top config.start_n_top start token possibilities (beam-search).
        end_top_log_probs (``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
            Log probabilities for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search).
        end_top_index (``torch.LongTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
            Indices for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search).
        cls_logits (``torch.FloatTensor`` of shape ``(batch_size,)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
            Log probabilities for the ``is_impossible`` label of the answers.
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        from transformers import XLMTokenizer, XLMForQuestionAnswering
        import torch

        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
        model = XLMForQuestionAnswering.from_pretrained('xlm-mlm-en-2048')
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        start_positions = torch.tensor([1])
        end_positions = torch.tensor([3])
        outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
        loss = outputs[0]

        """
        transformer_outputs = self.transformer(
            input_ids,
            attention_mask=attention_mask,
            langs=langs,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            lengths=lengths,
            cache=cache,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )

        output = transformer_outputs[0]

        outputs = self.qa_outputs(
            output,
            start_positions=start_positions,
            end_positions=end_positions,
            cls_index=cls_index,
            is_impossible=is_impossible,
            p_mask=p_mask,
        )

        outputs = outputs + transformer_outputs[1:]  # Keep hidden states and attentions if they are here

        return outputs


@add_start_docstrings(
    """XLM Model with a token classification head on top (a linear layer on top of
    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
    XLM_START_DOCSTRING,
)
class XLMForTokenClassification(XLMPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.transformer = XLMModel(config)
        self.dropout = nn.Dropout(config.dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        self.init_weights()

    @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Labels for computing the token classification loss.
            Indices should be in ``[0, ..., config.num_labels - 1]``.

    Returns:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.XLMConfig`) and inputs:
        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) :
            Classification loss.
        scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`)
            Classification scores (before SoftMax).
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        from transformers import XLMTokenizer, XLMForTokenClassification
        import torch

        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-100-1280')
        model = XLMForTokenClassification.from_pretrained('xlm-mlm-100-1280')
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
        labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids, labels=labels)
        loss, scores = outputs[:2]

        """
        outputs = self.transformer(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )

        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            # Only keep active parts of the loss
            if attention_mask is not None:
                active_loss = attention_mask.view(-1) == 1
                active_logits = logits.view(-1, self.num_labels)
                active_labels = torch.where(
                    active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
                )
                loss = loss_fct(active_logits, active_labels)
            else:
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            outputs = (loss,) + outputs

        return outputs  # (loss), scores, (hidden_states), (attentions)