""" TF 2.0 RoBERTa model. """


import logging

import tensorflow as tf

from .configuration_roberta import RobertaConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .modeling_tf_bert import TFBertEmbeddings, TFBertMainLayer, gelu
from .modeling_tf_utils import TFPreTrainedModel, get_initializer, shape_list


logger = logging.getLogger(__name__)

TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP = {
    "roberta-base": "https://cdn.huggingface.co/roberta-base-tf_model.h5",
    "roberta-large": "https://cdn.huggingface.co/roberta-large-tf_model.h5",
    "roberta-large-mnli": "https://cdn.huggingface.co/roberta-large-mnli-tf_model.h5",
    "distilroberta-base": "https://cdn.huggingface.co/distilroberta-base-tf_model.h5",
}


class TFRobertaEmbeddings(TFBertEmbeddings):
    """
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    """

    def __init__(self, config, **kwargs):
        super().__init__(config, **kwargs)
        self.padding_idx = 1

    def create_position_ids_from_input_ids(self, x):
        """ Replace non-padding symbols with their position numbers. Position numbers begin at
        padding_idx+1. Padding symbols are ignored. This is modified from fairseq's
        `utils.make_positions`.
        :param tf.Tensor x:
        :return tf.Tensor:
        Zdtyper   Zaxis)tfcastmath	not_equalr   int32Zcumsum)r   xmaskZincremental_indiciesr   r   r   "create_position_ids_from_input_ids0   s    z6TFRobertaEmbeddings.create_position_ids_from_input_idsc                 C   s@   t |d }tj| jd || j d tjdtjddf }|S )z We are provided embeddings directly. We cannot infer which are padded so just generate
        sequential position ids.
        :param tf.Tensor inputs_embeds:
        :return tf.Tensor:
        """
        seq_length = shape_list(inputs_embeds)[1]
        position_ids = tf.range(self.padding_idx + 1, seq_length + self.padding_idx + 1, dtype=tf.int32)[tf.newaxis, :]
        return position_ids

    def _embedding(self, inputs, training=False):
        """Applies embedding based on inputs tensor."""
        input_ids, position_ids, token_type_ids, inputs_embeds = inputs

        if position_ids is None:
            if input_ids is not None:
                # Create the position ids from the input token ids. Any padded tokens remain padded.
                position_ids = self.create_position_ids_from_input_ids(input_ids)
            else:
                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)

        return super()._embedding([input_ids, position_ids, token_type_ids, inputs_embeds], training=training)
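

# Editorial illustration (not part of the original module): how the fairseq-style position
# ids above behave on a toy batch, assuming RoBERTa's padding token id of 1 (so that
# padding_idx == 1, with <s> == 0 and </s> == 2):
#
#   x                            = [[0, 5, 6, 2, 1, 1]]   # <s> tok tok </s> <pad> <pad>
#   mask                         = [[1, 1, 1, 1, 0, 0]]
#   tf.math.cumsum(mask, axis=1) = [[1, 2, 3, 4, 4, 4]]
#   cumsum * mask                = [[1, 2, 3, 4, 0, 0]]
#   + padding_idx                = [[2, 3, 4, 5, 1, 1]]
#
# Non-padding tokens get positions starting at padding_idx + 1, while padding tokens keep
# position padding_idx, mirroring fairseq's `utils.make_positions`.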


class TFRobertaMainLayer(TFBertMainLayer):
    """
    Same as TFBertMainLayer but uses TFRobertaEmbeddings.
    """

    def __init__(self, config, **kwargs):
        super().__init__(config, **kwargs)
        self.embeddings = TFRobertaEmbeddings(config, name="embeddings")

    def get_input_embeddings(self):
        return self.embeddings


class TFRobertaPreTrainedModel(TFPreTrainedModel):
    """ An abstract class to handle weights initialization and
        a simple interface for downloading and loading pretrained models.
    """

    config_class = RobertaConfig
    pretrained_model_archive_map = TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
    base_model_prefix = "roberta"


ROBERTA_START_DOCSTRING = r"""
    This model is a `tf.keras.Model <https://www.tensorflow.org/api_docs/python/tf/keras/Model>`__ sub-class.
    Use it as a regular TF 2.0 Keras Model and
    refer to the TF 2.0 documentation for all matters related to general usage and behavior.

    .. note::

        TF 2.0 models accept two formats as inputs:

            - having all inputs as keyword arguments (like PyTorch models), or
            - having all inputs as a list, tuple or dict in the first positional argument.

        This second option is useful when using the :obj:`tf.keras.Model.fit()` method, which currently requires having
        all the tensors in the first argument of the model call function: :obj:`model(inputs)`.

        If you choose this second option, there are three possibilities you can use to gather all the input Tensors
        in the first positional argument:

        - a single Tensor with input_ids only and nothing else: :obj:`model(input_ids)`
        - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
          :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])`
        - a dictionary with one or several input Tensors associated to the input names given in the docstring:
          :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`

    Parameters:
        config (:class:`~transformers.RobertaConfig`): Model configuration class with all the parameters of the
            model. Initializing with a config file does not load the weights associated with the model, only the configuration.
            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
"""

ROBERTA_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using :class:`transformers.RobertaTokenizer`.
            See :func:`transformers.PreTrainedTokenizer.encode` and
            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.

            `What are input IDs? <../glossary.html#input-ids>`__
        attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.

            `What are attention masks? <../glossary.html#attention-mask>`__
        token_type_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Segment token indices to indicate first and second portions of the inputs.
            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
            corresponds to a `sentence B` token

            `What are token type IDs? <../glossary.html#token-type-ids>`__
        position_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Selected in the range ``[0, config.max_position_embeddings - 1]``.

            `What are position IDs? <../glossary.html#position-ids>`__
        head_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
            Mask to nullify selected heads of the self-attention modules.
            Mask values selected in ``[0, 1]``:
            :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
        inputs_embeds (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, embedding_dim)`, `optional`, defaults to :obj:`None`):
            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
            than the model's internal embedding lookup matrix.
        training (:obj:`boolean`, `optional`, defaults to :obj:`False`):
            Whether to activate dropout modules (if set to :obj:`True`) during training or to de-activate them
            (if set to :obj:`False`) for evaluation.
"""


@add_start_docstrings(
    "The bare RoBERTa Model transformer outputting raw hidden-states without any specific head on top.",
    ROBERTA_START_DOCSTRING,
)
class TFRobertaModel(TFRobertaPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.roberta = TFRobertaMainLayer(config, name="roberta")

    @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
    def call(self, inputs, **kwargs):
        r"""
    Returns:
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs:
        last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        pooler_output (:obj:`tf.Tensor` of shape :obj:`(batch_size, hidden_size)`):
            Last layer hidden-state of the first token of the sequence (classification token)
            further processed by a Linear layer and a Tanh activation function. The Linear
            layer weights are trained from the next sentence prediction (classification)
            objective during Bert pretraining. This output is usually *not* a good summary
            of the semantic content of the input, you're often better with averaging or pooling
            the sequence of hidden-states for the whole input sequence.
        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
            tuple of :obj:`tf.Tensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import RobertaTokenizer, TFRobertaModel

        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        model = TFRobertaModel.from_pretrained('roberta-base')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
        outputs = model(input_ids)
        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple

        """
        outputs = self.roberta(inputs, **kwargs)
        return outputs


class TFRobertaLMHead(tf.keras.layers.Layer):
    """Roberta Head for masked language modeling."""

    def __init__(self, config, input_embeddings, **kwargs):
        super().__init__(**kwargs)
        self.vocab_size = config.vocab_size
        self.dense = tf.keras.layers.Dense(
            config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
        )
        self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
        self.act = tf.keras.layers.Activation(gelu)

        # The output projection shares its weights with the input embeddings; only an
        # output-only bias per token is added in build().
        self.decoder = input_embeddings

    def build(self, input_shape):
        self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
        super().build(input_shape)

    def call(self, features):
        x = self.dense(features)
        x = self.act(x)
        x = self.layer_norm(x)

        # Project back to the size of the vocabulary with the shared embedding matrix, plus the bias.
        x = self.decoder(x, mode="linear") + self.bias

        return x
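

# Editorial note (illustrative, not from the original file): the LM head above ties its output
# projection to the input word embeddings. With E the embedding matrix of shape
# (vocab_size, hidden_size) and h a batch of hidden states, the head roughly computes
#
#   x      = layer_norm(gelu(dense(h)))      # self.dense -> self.act -> self.layer_norm
#   logits = x @ transpose(E) + bias         # self.decoder(x, mode="linear") + self.bias
#
# so the only new output-side parameter is the per-token bias created in build().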
zTFRobertaLMHead.call)r*   r+   r,   r-   r   rP   r<   r.   r   r   r   r   r?      s   r?   z6RoBERTa Model with a `language modeling` head on top. c                       s4   e Zd Z fddZdd Zeedd Z  ZS )TFRobertaForMaskedLMc                    s:   t  j|f|| t|dd| _t|| jjdd| _d S )Nr7   r1   lm_head)r   r   r/   r7   r?   r0   rT   r:   r   r   r   r     s    zTFRobertaForMaskedLM.__init__c                 C   s   | j jS r3   )rT   rL   r4   r   r   r   get_output_embeddings  s    z*TFRobertaForMaskedLM.get_output_embeddingsc                 K   s6   | j |f|}|d }| |}|f|dd  }|S )as  
    Return:
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs:
        prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
            tuple of :obj:`tf.Tensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import RobertaTokenizer, TFRobertaForMaskedLM

        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        model = TFRobertaForMaskedLM.from_pretrained('roberta-base')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
        outputs = model(input_ids)
        prediction_scores = outputs[0]

        """
        outputs = self.roberta(inputs, **kwargs)

        sequence_output = outputs[0]
        prediction_scores = self.lm_head(sequence_output)

        outputs = (prediction_scores,) + outputs[2:]  # Add hidden states and attentions if they are here

        return outputs  # prediction_scores, (hidden_states), (attentions)


class TFRobertaClassificationHead(tf.keras.layers.Layer):
    """Head for sentence-level classification tasks."""

    def __init__(self, config, **kwargs):
        super().__init__(config, **kwargs)
        self.dense = tf.keras.layers.Dense(
            config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), activation="tanh", name="dense"
        )
        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
        self.out_proj = tf.keras.layers.Dense(
            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj"
        )

    def call(self, features, training=False):
        x = features[:, 0, :]  # take <s> token (equivalent to [CLS])
        x = self.dropout(x, training=training)
        x = self.dense(x)
        x = self.dropout(x, training=training)
        x = self.out_proj(x)
        return x
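

# Editorial note (illustrative, not from the original file): instead of a BERT-style pooled
# output, the head above classifies from the hidden state of the first token, RoBERTa's <s>
# token, which plays the role of [CLS]:
#
#   sequence_output                        # (batch_size, seq_len, hidden_size)
#   x = sequence_output[:, 0, :]           # (batch_size, hidden_size)
#   x = dense(dropout(x))                  # dense has activation="tanh"
#   logits = out_proj(dropout(x))          # (batch_size, num_labels)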


@add_start_docstrings(
    """RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer
    on top of the pooled output) e.g. for GLUE tasks. """,
    ROBERTA_START_DOCSTRING,
)
class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels

        self.roberta = TFRobertaMainLayer(config, name="roberta")
        self.classifier = TFRobertaClassificationHead(config, name="classifier")

    @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
    def call(self, inputs, **kwargs):
        r"""
    Return:
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs:
        logits (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
            tuple of :obj:`tf.Tensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import RobertaTokenizer, TFRobertaForSequenceClassification

        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        model = TFRobertaForSequenceClassification.from_pretrained('roberta-base')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
        labels = tf.constant([1])[None, :]  # Batch size 1
        outputs = model(input_ids)
        logits = outputs[0]

        """
        outputs = self.roberta(inputs, **kwargs)

        sequence_output = outputs[0]
        logits = self.classifier(sequence_output, training=kwargs.get("training", False))

        outputs = (logits,) + outputs[2:]

        return outputs  # logits, (hidden_states), (attentions)
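

# Editorial usage sketch (not from the original file): the logits returned above are raw scores;
# a typical post-processing step for classification is
#
#   probs = tf.nn.softmax(logits, axis=-1)             # (batch_size, num_labels)
#   predicted_class = tf.math.argmax(logits, axis=-1)  # (batch_size,)
#
# while for config.num_labels == 1 the single score is used directly as a regression output.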


@add_start_docstrings(
    """RoBERTa Model with a token classification head on top (a linear layer on top of
    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
    ROBERTA_START_DOCSTRING,
)
class TFRobertaForTokenClassification(TFRobertaPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels

        self.roberta = TFRobertaMainLayer(config, name="roberta")
        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
        self.classifier = tf.keras.layers.Dense(
            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
        )

    @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
    def call(self, inputs, **kwargs):
        r"""
    Return:
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs:
        scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`):
            Classification scores (before SoftMax).
        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
            tuple of :obj:`tf.Tensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import RobertaTokenizer, TFRobertaForTokenClassification

        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        model = TFRobertaForTokenClassification.from_pretrained('roberta-base')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
        outputs = model(input_ids)
        scores = outputs[0]

        """
        outputs = self.roberta(inputs, **kwargs)

        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output, training=kwargs.get("training", False))
        logits = self.classifier(sequence_output)

        outputs = (logits,) + outputs[2:]

        return outputs  # scores, (hidden_states), (attentions)


@add_start_docstrings(
    """RoBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`). """,
    ROBERTA_START_DOCSTRING,
)
class TFRobertaForQuestionAnswering(TFRobertaPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels

        self.roberta = TFRobertaMainLayer(config, name="roberta")
        self.qa_outputs = tf.keras.layers.Dense(
            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
        )

    @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
    def call(self, inputs, **kwargs):
        r"""
    Return:
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs:
        start_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`):
            Span-start scores (before SoftMax).
        end_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`):
            Span-end scores (before SoftMax).
        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
            tuple of :obj:`tf.Tensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        # The checkpoint roberta-base is not fine-tuned for question answering. Please see the
        # examples/question-answering/run_squad.py example to see how to fine-tune a model to a question answering task.

        import tensorflow as tf
        from transformers import RobertaTokenizer, TFRobertaForQuestionAnswering

        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        model = TFRobertaForQuestionAnswering.from_pretrained('roberta-base')
        input_ids = tokenizer.encode("Who was Jim Henson?", "Jim Henson was a nice puppet")
        start_scores, end_scores = model(tf.constant(input_ids)[None, :]) # Batch size 1

        all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
        answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])

        """
        outputs = self.roberta(inputs, **kwargs)

        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = tf.split(logits, 2, axis=-1)
        start_logits = tf.squeeze(start_logits, axis=-1)
        end_logits = tf.squeeze(end_logits, axis=-1)

        outputs = (start_logits, end_logits,) + outputs[2:]

        return outputs  # start_logits, end_logits, (hidden_states), (attentions)
z"TFRobertaForQuestionAnswering.callr=   r   r   r   r   re     s   	re   )$r-   loggingZ
tensorflowr   Zconfiguration_robertar   Z
file_utilsr   r   Zmodeling_tf_bertr   r   r   Zmodeling_tf_utilsr	   r
   r   	getLoggerr*   loggerr8   r   r/   r6   ZROBERTA_START_DOCSTRINGr>   r9   rF   rG   ZLayerr?   rS   rX   r_   rd   re   r   r   r   r   <module>   sR   
-
).215