# coding=utf-8
"""PyTorch RoBERTa model. """


import logging

import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss, MSELoss

from .configuration_roberta import RobertaConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .modeling_bert import BertEmbeddings, BertLayerNorm, BertModel, BertPreTrainedModel, gelu
from .modeling_utils import create_position_ids_from_input_ids


logger = logging.getLogger(__name__)

ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP = {
    "roberta-base": "https://cdn.huggingface.co/roberta-base-pytorch_model.bin",
    "roberta-large": "https://cdn.huggingface.co/roberta-large-pytorch_model.bin",
    "roberta-large-mnli": "https://cdn.huggingface.co/roberta-large-mnli-pytorch_model.bin",
    "distilroberta-base": "https://cdn.huggingface.co/distilroberta-base-pytorch_model.bin",
    "roberta-base-openai-detector": "https://cdn.huggingface.co/roberta-base-openai-detector-pytorch_model.bin",
    "roberta-large-openai-detector": "https://cdn.huggingface.co/roberta-large-openai-detector-pytorch_model.bin",
}


class RobertaEmbeddings(BertEmbeddings):
    """
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    """

    def __init__(self, config):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=self.padding_idx)
        self.position_embeddings = nn.Embedding(
            config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
        )

    def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None):
        if position_ids is None:
            if input_ids is not None:
                # Create the position ids from the input token ids. Any padded tokens remain padded.
                position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx).to(input_ids.device)
            else:
                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)

        return super().forward(
            input_ids, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds
        )

    def create_position_ids_from_inputs_embeds(self, inputs_embeds):
        """ We are provided embeddings directly. We cannot infer which are padded so just generate
        sequential position ids.

        :param torch.Tensor inputs_embeds:
        :return torch.Tensor:
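
        Example (a sketch): with ``padding_idx = 1`` (RoBERTa's default pad token id)
        and a sequence length of 4, the ids generated below are ``[2, 3, 4, 5]`` for
        every sequence in the batch.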
        """
        input_shape = inputs_embeds.size()[:-1]
        sequence_length = input_shape[1]

        position_ids = torch.arange(
            self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
        )
        return position_ids.unsqueeze(0).expand(input_shape)


ROBERTA_START_DOCSTRING = r"""

    This model is a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`_ sub-class.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general
    usage and behavior.

    Parameters:
        config (:class:`~transformers.RobertaConfig`): Model configuration class with all the parameters of the
            model. Initializing with a config file does not load the weights associated with the model, only the configuration.
            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
"""

ROBERTA_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using :class:`transformers.RobertaTokenizer`.
            See :func:`transformers.PreTrainedTokenizer.encode` and
            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.

            `What are input IDs? <../glossary.html#input-ids>`__
        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.

            `What are attention masks? <../glossary.html#attention-mask>`__
        token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Segment token indices to indicate first and second portions of the inputs.
            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
            corresponds to a `sentence B` token.

            `What are token type IDs? <../glossary.html#token-type-ids>`_
        position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Selected in the range ``[0, config.max_position_embeddings - 1]``.

            `What are position IDs? <../glossary.html#position-ids>`_
        head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
            Mask to nullify selected heads of the self-attention modules.
            Mask values selected in ``[0, 1]``:
            :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
        inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
            than the model's internal embedding lookup matrix.
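
    Example of constructing these inputs (a sketch; it assumes :class:`transformers.RobertaTokenizer`
    and its ``encode_plus`` method)::

        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        encoding = tokenizer.encode_plus("Hello, my dog is cute", return_tensors="pt")
        input_ids = encoding["input_ids"]            # torch.LongTensor of shape (1, sequence_length)
        attention_mask = encoding["attention_mask"]  # 1 for tokens to attend to, 0 for padding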
"""


@add_start_docstrings(
    "The bare RoBERTa Model transformer outputting raw hidden-states without any specific head on top.",
    ROBERTA_START_DOCSTRING,
)
class RobertaModel(BertModel):
    """
    This class overrides :class:`~transformers.BertModel`. Please check the
    superclass for the appropriate documentation alongside usage examples.
    """

    config_class = RobertaConfig
    pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
    base_model_prefix = "roberta"

    def __init__(self, config):
        super().__init__(config)

        self.embeddings = RobertaEmbeddings(config)
        self.init_weights()

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value


@add_start_docstrings("""RoBERTa Model with a `language modeling` head on top. """, ROBERTA_START_DOCSTRING)
class RobertaForMaskedLM(BertPreTrainedModel):
    config_class = RobertaConfig
    pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
    base_model_prefix = "roberta"

    def __init__(self, config):
        super().__init__(config)

        self.roberta = RobertaModel(config)
        self.lm_head = RobertaLMHead(config)

        self.init_weights()

    def get_output_embeddings(self):
        return self.lm_head.decoder

    @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        masked_lm_labels=None,
    ):
        r"""
        masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Labels for computing the masked language modeling loss.
            Indices should be in ``[-100, 0, ..., config.vocab_size - 1]`` (see ``input_ids`` docstring).
            Tokens with indices set to ``-100`` are ignored (masked); the loss is only computed for the tokens
            with labels in ``[0, ..., config.vocab_size - 1]``.

    Returns:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs:
        masked_lm_loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
            Masked language modeling loss.
        prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        from transformers import RobertaTokenizer, RobertaForMaskedLM
        import torch

        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        model = RobertaForMaskedLM.from_pretrained('roberta-base')
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids, masked_lm_labels=input_ids)
        loss, prediction_scores = outputs[:2]
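
        # A further sketch (the position index here is purely illustrative): inspect
        # the most likely token at position i of the first sequence.
        i = 4
        predicted_id = prediction_scores[0, i].argmax().item()
        predicted_token = tokenizer.convert_ids_to_tokens(predicted_id)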

        """
        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )
        sequence_output = outputs[0]
        prediction_scores = self.lm_head(sequence_output)

        outputs = (prediction_scores,) + outputs[2:]  # Add hidden states and attention if they are here

        if masked_lm_labels is not None:
            loss_fct = CrossEntropyLoss()
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
            outputs = (masked_lm_loss,) + outputs

        return outputs  # (masked_lm_loss), prediction_scores, (hidden_states), (attentions)


class RobertaLMHead(nn.Module):
    """Roberta Head for masked language modeling."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.layer_norm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.bias = nn.Parameter(torch.zeros(config.vocab_size))

        # Link the two variables so the bias is correctly resized with `resize_token_embeddings`
        self.decoder.bias = self.bias

    def forward(self, features, **kwargs):
        x = self.dense(features)
        x = gelu(x)
        x = self.layer_norm(x)

        # project back to size of vocabulary with bias
        x = self.decoder(x)

        return x


@add_start_docstrings(
    """RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer
    on top of the pooled output) e.g. for GLUE tasks. """,
    ROBERTA_START_DOCSTRING,
)
class RobertaForSequenceClassification(BertPreTrainedModel):
    config_class = RobertaConfig
    pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
    base_model_prefix = "roberta"

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.roberta = RobertaModel(config)
        self.classifier = RobertaClassificationHead(config)

    @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for computing the sequence classification/regression loss.
            Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
            If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).

    Returns:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs:
        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        from transformers import RobertaTokenizer, RobertaForSequenceClassification
        import torch

        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        model = RobertaForSequenceClassification.from_pretrained('roberta-base')
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids, labels=labels)
        loss, logits = outputs[:2]
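
        # A further sketch: the predicted class index is the argmax over the label
        # dimension of the logits.
        predicted_class = logits.argmax(dim=-1).item()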

        """
        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )
        sequence_output = outputs[0]
        logits = self.classifier(sequence_output)

        outputs = (logits,) + outputs[2:]

        if labels is not None:
            if self.num_labels == 1:
                # We are doing regression
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1))
            else:
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            outputs = (loss,) + outputs

        return outputs  # (loss), logits, (hidden_states), (attentions)


@add_start_docstrings(
    """Roberta Model with a multiple choice classification head on top (a linear layer on top of
    the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
    ROBERTA_START_DOCSTRING,
)
class RobertaForMultipleChoice(BertPreTrainedModel):
    config_class = RobertaConfig
    pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
    base_model_prefix = "roberta"

    def __init__(self, config):
        super().__init__(config)

        self.roberta = RobertaModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, 1)

        self.init_weights()

    @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids=None,
        token_type_ids=None,
        attention_mask=None,
        labels=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for computing the multiple choice classification loss.
            Indices should be in ``[0, ..., num_choices - 1]`` where `num_choices` is the size of the second dimension
            of the input tensors. (see `input_ids` above)

    Returns:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs:
        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
            Classification loss.
        classification_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`):
            `num_choices` is the second dimension of the input tensors. (see `input_ids` above).

            Classification scores (before SoftMax).
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        from transformers import RobertaTokenizer, RobertaForMultipleChoice
        import torch

        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        model = RobertaForMultipleChoice.from_pretrained('roberta-base')
        choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
        input_ids = torch.tensor([tokenizer.encode(s, add_special_tokens=True) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
        labels = torch.tensor(1).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids, labels=labels)
        loss, classification_scores = outputs[:2]
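
        # A further sketch: the predicted choice is the argmax over the
        # num_choices dimension of the classification scores.
        predicted_choice = classification_scores.argmax(dim=-1).item()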

        """
        num_choices = input_ids.shape[1]

        flat_input_ids = input_ids.view(-1, input_ids.size(-1))
        flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
        flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
        flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
        outputs = self.roberta(
            flat_input_ids,
            position_ids=flat_position_ids,
            token_type_ids=flat_token_type_ids,
            attention_mask=flat_attention_mask,
            head_mask=head_mask,
        )
        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        reshaped_logits = logits.view(-1, num_choices)

        outputs = (reshaped_logits,) + outputs[2:]  # add hidden states and attention if they are here

        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)
            outputs = (loss,) + outputs

        return outputs  # (loss), reshaped_logits, (hidden_states), (attentions)


@add_start_docstrings(
    """Roberta Model with a token classification head on top (a linear layer on top of
    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
    ROBERTA_START_DOCSTRING,
)
class RobertaForTokenClassification(BertPreTrainedModel):
    config_class = RobertaConfig
    pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
    base_model_prefix = "roberta"

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.roberta = RobertaModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        self.init_weights()

    @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Labels for computing the token classification loss.
            Indices should be in ``[0, ..., config.num_labels - 1]``.

    Returns:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs:
        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided):
            Classification loss.
        scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`)
            Classification scores (before SoftMax).
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        from transformers import RobertaTokenizer, RobertaForTokenClassification
        import torch

        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        model = RobertaForTokenClassification.from_pretrained('roberta-base')
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids, labels=labels)
        loss, scores = outputs[:2]
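
        # A further sketch: per-token label predictions are the argmax over the
        # label dimension.
        predictions = scores.argmax(dim=-1)  # shape (batch_size, sequence_length)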

        """
        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )

        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            # Only keep the active parts of the loss
            if attention_mask is not None:
                active_loss = attention_mask.view(-1) == 1
                active_logits = logits.view(-1, self.num_labels)
                active_labels = torch.where(
                    active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
                )
                loss = loss_fct(active_logits, active_labels)
            else:
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            outputs = (loss,) + outputs

        return outputs  # (loss), scores, (hidden_states), (attentions)


class RobertaClassificationHead(nn.Module):
    """Head for sentence-level classification tasks."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, features, **kwargs):
        x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x


@add_start_docstrings(
    """Roberta Model with a span classification head on top for extractive question-answering tasks like SQuAD (linear layers on top of
    the hidden-states output to compute `span start logits` and `span end logits`). """,
    ROBERTA_START_DOCSTRING,
)
class RobertaForQuestionAnswering(BertPreTrainedModel):
    config_class = RobertaConfig
    pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
    base_model_prefix = "roberta"

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.roberta = RobertaModel(config)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        self.init_weights()

    @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        start_positions=None,
        end_positions=None,
    ):
        r"""
        start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`).
            Positions outside of the sequence are not taken into account for computing the loss.
        end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`).
            Positions outside of the sequence are not taken into account for computing the loss.

    Returns:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs:
        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`start_positions` and :obj:`end_positions` are provided):
            Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
        start_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`):
            Span-start scores (before SoftMax).
        end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`):
            Span-end scores (before SoftMax).
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        # The checkpoint roberta-large is not fine-tuned for question answering. Please see the
        # examples/question-answering/run_squad.py example to see how to fine-tune a model to a question answering task.

        from transformers import RobertaTokenizer, RobertaForQuestionAnswering
        import torch

        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        model = RobertaForQuestionAnswering.from_pretrained('roberta-base')

        question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
        input_ids = tokenizer.encode(question, text)
        start_scores, end_scores = model(torch.tensor([input_ids]))

        all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
        answer = ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1])
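
        # A hedged alternative: RoBERTa's byte-level BPE keeps 'Ġ' whitespace markers
        # on the tokens joined above; decoding the span ids directly gives cleaner text.
        answer = tokenizer.decode(input_ids[torch.argmax(start_scores) : torch.argmax(end_scores) + 1])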

        """
        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )

        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        outputs = (start_logits, end_logits,) + outputs[2:]
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split adds a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions.clamp_(0, ignored_index)
            end_positions.clamp_(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2
            outputs = (total_loss,) + outputs

        return outputs  # (loss), start_logits, end_logits, (hidden_states), (attentions)