""" PyTorch DistilBERT model
    adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM)
    and in part from HuggingFace PyTorch version of Google AI Bert model (https://github.com/google-research/bert)
    N)CrossEntropyLoss   )gelu)DistilBertConfig)add_start_docstrings add_start_docstrings_to_callable)PreTrainedModelprune_linear_layerzDhttps://cdn.huggingface.co/distilbert-base-uncased-pytorch_model.binzThttps://cdn.huggingface.co/distilbert-base-uncased-distilled-squad-pytorch_model.binzBhttps://cdn.huggingface.co/distilbert-base-cased-pytorch_model.binzRhttps://cdn.huggingface.co/distilbert-base-cased-distilled-squad-pytorch_model.binzIhttps://cdn.huggingface.co/distilbert-base-german-cased-pytorch_model.binzOhttps://cdn.huggingface.co/distilbert-base-multilingual-cased-pytorch_model.binz\https://cdn.huggingface.co/distilbert-base-uncased-finetuned-sst-2-english-pytorch_model.bin)zdistilbert-base-uncasedz'distilbert-base-uncased-distilled-squadzdistilbert-base-casedz%distilbert-base-cased-distilled-squadzdistilbert-base-german-casedz"distilbert-base-multilingual-casedz/distilbert-base-uncased-finetuned-sst-2-englishc              	      s   t  fddt| D }tt |d d dd df |d d dd df< tt |d d dd df |d d dd df< |  d|_d S )Nc                    s$   g | ]  fd dt D qS )c              	      s(   g | ] }t d d|d     qS )i'     )nppower).0j)dimpos D/tmp/pip-unpacked-wheel-ymerj3tt/transformers/modeling_distilbert.py
<listcomp>6   s     z;create_sinusoidal_embeddings.<locals>.<listcomp>.<listcomp>)range)r   r   )r   r   r   6   s     z0create_sinusoidal_embeddings.<locals>.<listcomp>r   r
   r   F)	r   arrayr   torchZFloatTensorsincosZdetach_requires_grad)n_posr   outZposition_encr   r   r   create_sinusoidal_embeddings5   s
    44r   c                       s$   e Zd Z fddZdd Z  ZS )
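

# Illustrative sketch, not part of the original module: a quick check of what
# `create_sinusoidal_embeddings` writes into a table. The 128x16 size is an
# arbitrary assumption for the example; at position 0 the sine columns are 0
# and the cosine columns are 1.
def _demo_sinusoidal_embeddings():
    table = torch.empty(128, 16)
    create_sinusoidal_embeddings(n_pos=128, dim=16, out=table)
    assert torch.allclose(table[0, 0::2], torch.zeros(8))  # sin(0) = 0
    assert torch.allclose(table[0, 1::2], torch.ones(8))  # cos(0) = 1
    return table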


class Embeddings(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.dim, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.dim)
        if config.sinusoidal_pos_embds:
            create_sinusoidal_embeddings(
                n_pos=config.max_position_embeddings, dim=config.dim, out=self.position_embeddings.weight
            )

        self.LayerNorm = nn.LayerNorm(config.dim, eps=1e-12)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, input_ids):
        """
        Parameters
        ----------
        input_ids: torch.tensor(bs, max_seq_length)
            The token ids to embed.

        Outputs
        -------
        embeddings: torch.tensor(bs, max_seq_length, dim)
            The embedded tokens (plus position embeddings, no token_type embeddings)
        """
        seq_length = input_ids.size(1)
        position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)  # (max_seq_length)
        position_ids = position_ids.unsqueeze(0).expand_as(input_ids)  # (bs, max_seq_length)

        word_embeddings = self.word_embeddings(input_ids)  # (bs, max_seq_length, dim)
        position_embeddings = self.position_embeddings(position_ids)  # (bs, max_seq_length, dim)

        embeddings = word_embeddings + position_embeddings  # (bs, max_seq_length, dim)
        embeddings = self.LayerNorm(embeddings)  # (bs, max_seq_length, dim)
        embeddings = self.dropout(embeddings)  # (bs, max_seq_length, dim)
        return embeddings


class MultiHeadSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()

        self.n_heads = config.n_heads
        self.dim = config.dim
        self.dropout = nn.Dropout(p=config.attention_dropout)
        self.output_attentions = config.output_attentions

        assert self.dim % self.n_heads == 0

        self.q_lin = nn.Linear(in_features=config.dim, out_features=config.dim)
        self.k_lin = nn.Linear(in_features=config.dim, out_features=config.dim)
        self.v_lin = nn.Linear(in_features=config.dim, out_features=config.dim)
        self.out_lin = nn.Linear(in_features=config.dim, out_features=config.dim)

        self.pruned_heads = set()

    def prune_heads(self, heads):
        attention_head_size = self.dim // self.n_heads
        if len(heads) == 0:
            return
        mask = torch.ones(self.n_heads, attention_head_size)
        heads = set(heads) - self.pruned_heads
        for head in heads:
            head -= sum(1 if h < head else 0 for h in self.pruned_heads)
            mask[head] = 0
        mask = mask.view(-1).contiguous().eq(1)
        index = torch.arange(len(mask))[mask].long()
        # Prune the linear layers and update the bookkeeping attributes
        self.q_lin = prune_linear_layer(self.q_lin, index)
        self.k_lin = prune_linear_layer(self.k_lin, index)
        self.v_lin = prune_linear_layer(self.v_lin, index)
        self.out_lin = prune_linear_layer(self.out_lin, index, dim=1)
        self.n_heads = self.n_heads - len(heads)
        self.dim = attention_head_size * self.n_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(self, query, key, value, mask, head_mask=None):
        """
        Parameters
        ----------
        query: torch.tensor(bs, seq_length, dim)
        key: torch.tensor(bs, seq_length, dim)
        value: torch.tensor(bs, seq_length, dim)
        mask: torch.tensor(bs, seq_length)

        Outputs
        -------
        weights: torch.tensor(bs, n_heads, seq_length, seq_length)
            Attention weights. Optional: only returned if `output_attentions=True`
        context: torch.tensor(bs, seq_length, dim)
            Contextualized layer.
        """
        bs, q_length, dim = query.size()
        k_length = key.size(1)
        dim_per_head = self.dim // self.n_heads

        mask_reshp = (bs, 1, 1, k_length)

        def shape(x):
            """ separate heads """
            return x.view(bs, -1, self.n_heads, dim_per_head).transpose(1, 2)

        def unshape(x):
            """ group heads """
            return x.transpose(1, 2).contiguous().view(bs, -1, self.n_heads * dim_per_head)

        q = shape(self.q_lin(query))  # (bs, n_heads, q_length, dim_per_head)
        k = shape(self.k_lin(key))  # (bs, n_heads, k_length, dim_per_head)
        v = shape(self.v_lin(value))  # (bs, n_heads, k_length, dim_per_head)

        q = q / math.sqrt(dim_per_head)  # (bs, n_heads, q_length, dim_per_head)
        scores = torch.matmul(q, k.transpose(2, 3))  # (bs, n_heads, q_length, k_length)
        mask = (mask == 0).view(mask_reshp).expand_as(scores)  # (bs, n_heads, q_length, k_length)
        scores.masked_fill_(mask, -float("inf"))  # (bs, n_heads, q_length, k_length)

        weights = nn.Softmax(dim=-1)(scores)  # (bs, n_heads, q_length, k_length)
        weights = self.dropout(weights)  # (bs, n_heads, q_length, k_length)

        # Mask heads if requested
        if head_mask is not None:
            weights = weights * head_mask

        context = torch.matmul(weights, v)  # (bs, n_heads, q_length, dim_per_head)
        context = unshape(context)  # (bs, q_length, dim)
        context = self.out_lin(context)  # (bs, q_length, dim)

        if self.output_attentions:
            return (context, weights)
        else:
            return (context,)
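

# Illustrative sketch, not part of the original module: the head split/merge
# performed by the `shape`/`unshape` helpers inside MultiHeadSelfAttention.forward,
# on assumed toy sizes. Splitting into heads and merging back is lossless.
def _demo_attention_head_reshaping():
    bs, seq_length, dim, n_heads = 2, 5, 8, 2
    dim_per_head = dim // n_heads
    x = torch.rand(bs, seq_length, dim)
    split = x.view(bs, -1, n_heads, dim_per_head).transpose(1, 2)  # (bs, n_heads, seq_length, dim_per_head)
    merged = split.transpose(1, 2).contiguous().view(bs, -1, n_heads * dim_per_head)  # (bs, seq_length, dim)
    assert split.shape == (bs, n_heads, seq_length, dim_per_head)
    assert torch.equal(merged, x)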
zMultiHeadSelfAttention.forward)N)r;   r<   r=   r#   r\   r9   r>   r   r   r0   r   r?   c   s   r?   c                       s$   e Zd Z fddZdd Z  ZS )FFNc                    sx   t    tj|jd| _tj|j|jd| _tj|j|jd| _	|j
dks\td|j
|j
dkrjtnt | _
d S )Nr@   rB   )Zrelur   z+activation ({}) must be in ['relu', 'gelu']r   )r"   r#   r$   r+   r,   rF   r   Z
hidden_dimlin1lin2
activationrE   formatr   ReLUr-   r0   r   r   r#      s    
zFFN.__init__c                 C   s,   |  |}| |}| |}| |}|S N)rt   rv   ru   r,   )r.   inputr_   r   r   r   r9      s
    



zFFN.forwardr:   r   r   r0   r   rs      s   
rs   c                       s&   e Zd Z fddZdddZ  ZS )TransformerBlockc                    sb   t    |j| _|j|j dks&tt|| _tj	|jdd| _
t|| _tj	|jdd| _d S )Nr   r   )Znormalized_shaper!   )r"   r#   rD   r   rC   rE   r?   	attentionr$   r*   sa_layer_normrs   ffnoutput_layer_normr-   r0   r   r   r#      s    


zTransformerBlock.__init__Nc                 C   s|   | j |||||d}| jr$|\}}nt|tks4t|d }| || }| |}| || }|f}| jrx|f| }|S )a  
        Parameters
        ----------
        x: torch.tensor(bs, seq_length, dim)
        attn_mask: torch.tensor(bs, seq_length)

        Outputs
        -------
        sa_weights: torch.tensor(bs, n_heads, seq_length, seq_length)
            The attention weights
        ffn_output: torch.tensor(bs, seq_length, dim)
            The output of the transformer block contextualization.
        """
        # Self-attention block
        sa_output = self.attention(query=x, key=x, value=x, mask=attn_mask, head_mask=head_mask)
        if self.output_attentions:
            sa_output, sa_weights = sa_output  # (bs, seq_length, dim), (bs, n_heads, seq_length, seq_length)
        else:
            assert type(sa_output) == tuple
            sa_output = sa_output[0]
        sa_output = self.sa_layer_norm(sa_output + x)  # (bs, seq_length, dim)

        # Feed-forward network block
        ffn_output = self.ffn(sa_output)  # (bs, seq_length, dim)
        ffn_output = self.output_layer_norm(ffn_output + sa_output)  # (bs, seq_length, dim)

        output = (ffn_output,)
        if self.output_attentions:
            output = (sa_weights,) + output
        return output


class Transformer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.n_layers = config.n_layers
        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states

        layer = TransformerBlock(config)
        self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.n_layers)])

    def forward(self, x, attn_mask=None, head_mask=None):
        """
        Parameters
        ----------
        x: torch.tensor(bs, seq_length, dim)
            Input sequence embedded.
        attn_mask: torch.tensor(bs, seq_length)
            Attention mask on the sequence.

        Outputs
        -------
        hidden_state: torch.tensor(bs, seq_length, dim)
            Sequence of hidden states in the last (top) layer
        all_hidden_states: Tuple[torch.tensor(bs, seq_length, dim)]
            Tuple of length n_layers with the hidden states from each layer.
            Optional: only if output_hidden_states=True
        all_attentions: Tuple[torch.tensor(bs, n_heads, seq_length, seq_length)]
            Tuple of length n_layers with the attention weights from each layer
            Optional: only if output_attentions=True
        """
        all_hidden_states = ()
        all_attentions = ()

        hidden_state = x
        for i, layer_module in enumerate(self.layer):
            if self.output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_state,)

            layer_outputs = layer_module(x=hidden_state, attn_mask=attn_mask, head_mask=head_mask[i])
            hidden_state = layer_outputs[-1]

            if self.output_attentions:
                assert len(layer_outputs) == 2
                attentions = layer_outputs[0]
                all_attentions = all_attentions + (attentions,)
            else:
                assert len(layer_outputs) == 1

        # Add the hidden state of the last layer
        if self.output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_state,)

        outputs = (hidden_state,)
        if self.output_hidden_states:
            outputs = outputs + (all_hidden_states,)
        if self.output_attentions:
            outputs = outputs + (all_attentions,)
        return outputs  # last-layer hidden state, (all hidden states), (all attentions)
dS )DistilBertPreTrainedModelz An abstract class to handle weights initialization and
        a simple interface for downloading and loading pretrained models.
    N
distilbertc                 C   s   t |tjr*|jjr*|jjjd| jjd t |tj	rN|jjjd| jjd n&t |tj
rt|jj  |jjd t |tj	r|jdk	r|jj  dS )z! Initialize the weights.
        g        )ZmeanZstdg      ?N)
isinstancer$   r%   r)   r   dataZnormal_r/   Zinitializer_rangerF   r*   ZbiasZzero_Zfill_)r.   moduler   r   r   _init_weightsN  s    z'DistilBertPreTrainedModel._init_weights)r;   r<   r=   __doc__r   Zconfig_class'DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAPZpretrained_model_archive_mapZload_tf_weightsZbase_model_prefixr   r   r   r   r   r   D  s   r   at  

    This model is a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`_ sub-class.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general
    usage and behavior.

    Parameters:
        config (:class:`~transformers.DistilBertConfig`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the configuration.
            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
"""

DISTILBERT_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using :class:`transformers.DistilBertTokenizer`.
            See :func:`transformers.PreTrainedTokenizer.encode` and
            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.

            `What are input IDs? <../glossary.html#input-ids>`__
        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.

            `What are attention masks? <../glossary.html#attention-mask>`__
        head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
            Mask to nullify selected heads of the self-attention modules.
            Mask values selected in ``[0, 1]``:
            :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
        inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
            than the model's internal embedding lookup matrix.
"""


@add_start_docstrings(
    "The bare DistilBERT encoder/transformer outputting raw hidden-states without any specific head on top.",
    DISTILBERT_START_DOCSTRING,
)
class DistilBertModel(DistilBertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.embeddings = Embeddings(config)  # token + position embeddings
        self.transformer = Transformer(config)  # encoder stack

        self.init_weights()

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, new_embeddings):
        self.embeddings.word_embeddings = new_embeddings

    def _prune_heads(self, heads_to_prune):
        """ Prunes heads of the model.
            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
            See base class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.transformer.layer[layer].attention.prune_heads(heads)

    @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
    def forward(self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None):
        r"""
    Return:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.DistilBertConfig`) and inputs:
        last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        from transformers import DistilBertTokenizer, DistilBertModel
        import torch

        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
        model = DistilBertModel.from_pretrained('distilbert-base-cased')

        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids)

        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple

        """
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        device = input_ids.device if input_ids is not None else inputs_embeds.device

        if attention_mask is None:
            attention_mask = torch.ones(input_shape, device=device)  # (bs, seq_length)

        # Prepare the head mask if needed
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        if inputs_embeds is None:
            inputs_embeds = self.embeddings(input_ids)  # (bs, seq_length, dim)
        tfmr_output = self.transformer(x=inputs_embeds, attn_mask=attention_mask, head_mask=head_mask)
        hidden_state = tfmr_output[0]
        output = (hidden_state,) + tfmr_output[1:]

        return output  # last-layer hidden state, (all hidden states), (all attentions)


@add_start_docstrings(
    """DistilBert Model with a `masked language modeling` head on top. """,
    DISTILBERT_START_DOCSTRING,
)
class DistilBertForMaskedLM(DistilBertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states

        self.distilbert = DistilBertModel(config)
        self.vocab_transform = nn.Linear(config.dim, config.dim)
        self.vocab_layer_norm = nn.LayerNorm(config.dim, eps=1e-12)
        self.vocab_projector = nn.Linear(config.dim, config.vocab_size)

        self.init_weights()

        self.mlm_loss_fct = nn.CrossEntropyLoss()

    def get_output_embeddings(self):
        return self.vocab_projector

    @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
    def forward(self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, masked_lm_labels=None):
        r"""
        masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Labels for computing the masked language modeling loss.
            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
            in ``[0, ..., config.vocab_size]``

    Returns:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.DistilBertConfig`) and inputs:
        loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
            Masked language modeling loss.
        prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        from transformers import DistilBertTokenizer, DistilBertForMaskedLM
        import torch

        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
        model = DistilBertForMaskedLM.from_pretrained('distilbert-base-cased')
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids, masked_lm_labels=input_ids)
        loss, prediction_scores = outputs[:2]

        """
        dlbrt_output = self.distilbert(
            input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds
        )
        hidden_states = dlbrt_output[0]  # (bs, seq_length, dim)
        prediction_logits = self.vocab_transform(hidden_states)  # (bs, seq_length, dim)
        prediction_logits = gelu(prediction_logits)  # (bs, seq_length, dim)
        prediction_logits = self.vocab_layer_norm(prediction_logits)  # (bs, seq_length, dim)
        prediction_logits = self.vocab_projector(prediction_logits)  # (bs, seq_length, vocab_size)

        outputs = (prediction_logits,) + dlbrt_output[1:]
        if masked_lm_labels is not None:
            mlm_loss = self.mlm_loss_fct(
                prediction_logits.view(-1, prediction_logits.size(-1)), masked_lm_labels.view(-1)
            )
            outputs = (mlm_loss,) + outputs

        return outputs  # (mlm_loss), prediction_logits, (all hidden states), (all attentions)


@add_start_docstrings(
    """DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of
    the pooled output) e.g. for GLUE tasks. """,
    DISTILBERT_START_DOCSTRING,
)
class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.distilbert = DistilBertModel(config)
        self.pre_classifier = nn.Linear(config.dim, config.dim)
        self.classifier = nn.Linear(config.dim, config.num_labels)
        self.dropout = nn.Dropout(config.seq_classif_dropout)

        self.init_weights()

    @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
    def forward(self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, labels=None):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for computing the sequence classification/regression loss.
            Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
            If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).

    Returns:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.DistilBertConfig`) and inputs:
        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
        import torch

        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
        model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-cased')
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids, labels=labels)
        loss, logits = outputs[:2]

        """
        distilbert_output = self.distilbert(
            input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds
        )
        hidden_state = distilbert_output[0]  # (bs, seq_length, dim)
        pooled_output = hidden_state[:, 0]  # (bs, dim): the first-token ([CLS]) representation
        pooled_output = self.pre_classifier(pooled_output)  # (bs, dim)
        pooled_output = nn.ReLU()(pooled_output)  # (bs, dim)
        pooled_output = self.dropout(pooled_output)  # (bs, dim)
        logits = self.classifier(pooled_output)  # (bs, num_labels)

        outputs = (logits,) + distilbert_output[1:]
        if labels is not None:
            if self.num_labels == 1:
                loss_fct = nn.MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1))
            else:
                loss_fct = nn.CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            outputs = (loss,) + outputs

        return outputs  # (loss), logits, (hidden states), (attentions)


@add_start_docstrings(
    """DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (linear layers on top of
    the hidden-states output to compute `span start logits` and `span end logits`). """,
    DISTILBERT_START_DOCSTRING,
)
class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.distilbert = DistilBertModel(config)
        self.qa_outputs = nn.Linear(config.dim, config.num_labels)
        assert config.num_labels == 2
        self.dropout = nn.Dropout(config.qa_dropout)

        self.init_weights()

    @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        head_mask=None,
        inputs_embeds=None,
        start_positions=None,
        end_positions=None,
    ):
        r"""
        start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`).
            Positions outside of the sequence are not taken into account for computing the loss.
        end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`).
            Positions outside of the sequence are not taken into account for computing the loss.

    Returns:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.DistilBertConfig`) and inputs:
        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
            Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
        start_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`):
            Span-start scores (before SoftMax).
        end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`):
            Span-end scores (before SoftMax).
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering
        import torch

        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
        model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-cased')
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        start_positions = torch.tensor([1])
        end_positions = torch.tensor([3])
        outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
        loss, start_scores, end_scores = outputs[:3]

        """
        distilbert_output = self.distilbert(
            input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds
        )
        hidden_states = distilbert_output[0]  # (bs, max_query_len, dim)

        hidden_states = self.dropout(hidden_states)  # (bs, max_query_len, dim)
        logits = self.qa_outputs(hidden_states)  # (bs, max_query_len, 2)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)  # (bs, max_query_len)
        end_logits = end_logits.squeeze(-1)  # (bs, max_query_len)

        outputs = (start_logits, end_logits,) + distilbert_output[1:]
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split adds an extra dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # Positions outside the model inputs are clamped and then ignored by the loss
            ignored_index = start_logits.size(1)
            start_positions.clamp_(0, ignored_index)
            end_positions.clamp_(0, ignored_index)

            loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2
            outputs = (total_loss,) + outputs

        return outputs  # (loss), start_logits, end_logits, (hidden states), (attentions)


@add_start_docstrings(
    """DistilBert Model with a token classification head on top (a linear layer on top of
    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
    DISTILBERT_START_DOCSTRING,
)
class DistilBertForTokenClassification(DistilBertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.distilbert = DistilBertModel(config)
        self.dropout = nn.Dropout(config.dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        self.init_weights()

    @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
    def forward(self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, labels=None):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Labels for computing the token classification loss.
            Indices should be in ``[0, ..., config.num_labels - 1]``.

    Returns:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.DistilBertConfig`) and inputs:
        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) :
            Classification loss.
        scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`)
            Classification scores (before SoftMax).
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        from transformers import DistilBertTokenizer, DistilBertForTokenClassification
        import torch

        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
        model = DistilBertForTokenClassification.from_pretrained('distilbert-base-cased')
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
        labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids, labels=labels)
        loss, scores = outputs[:2]

        """
        outputs = self.distilbert(
            input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds
        )
        sequence_output = outputs[0]  # (bs, seq_length, dim)

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)  # (bs, seq_length, num_labels)

        outputs = (logits,) + outputs[1:]  # add hidden states and attentions if they are here
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            # Only keep the active parts of the loss (non-padded tokens)
            if attention_mask is not None:
                active_loss = attention_mask.view(-1) == 1
                active_logits = logits.view(-1, self.num_labels)
                active_labels = torch.where(
                    active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
                )
                loss = loss_fct(active_logits, active_labels)
            else:
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            outputs = (loss,) + outputs

        return outputs  # (loss), scores, (hidden states), (attentions)
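

# Illustrative sketch, not part of the original module: a randomly initialised
# DistilBertForSequenceClassification on a padded batch, showing how the
# attention_mask marks padding (token id 0 is assumed to be the padding id).
# The tiny config values and label count are assumptions for the example only;
# a real setup would load pretrained weights with `from_pretrained`.
def _demo_sequence_classification():
    config = DistilBertConfig(vocab_size=100, n_layers=2, n_heads=2, dim=8, hidden_dim=16, num_labels=3)
    model = DistilBertForSequenceClassification(config)
    input_ids = torch.tensor([[5, 6, 7, 0, 0], [8, 9, 10, 11, 12]])  # second row has no padding
    attention_mask = (input_ids != 0).long()
    labels = torch.tensor([0, 2])
    loss, logits = model(input_ids, attention_mask=attention_mask, labels=labels)[:2]
    assert logits.shape == (2, config.num_labels)
    return loss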