# coding=utf-8 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ TF 2.0 RoBERTa model. """ import logging import tensorflow as tf from .configuration_roberta import RobertaConfig from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .modeling_tf_bert import TFBertEmbeddings, TFBertMainLayer, gelu from .modeling_tf_utils import TFPreTrainedModel, get_initializer, shape_list logger = logging.getLogger(__name__) TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP = { "roberta-base": "https://cdn.huggingface.co/roberta-base-tf_model.h5", "roberta-large": "https://cdn.huggingface.co/roberta-large-tf_model.h5", "roberta-large-mnli": "https://cdn.huggingface.co/roberta-large-mnli-tf_model.h5", "distilroberta-base": "https://cdn.huggingface.co/distilroberta-base-tf_model.h5", } class TFRobertaEmbeddings(TFBertEmbeddings): """ Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. """ def __init__(self, config, **kwargs): super().__init__(config, **kwargs) self.padding_idx = 1 def create_position_ids_from_input_ids(self, x): """ Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols are ignored. This is modified from fairseq's `utils.make_positions`. :param tf.Tensor x: :return tf.Tensor: """ mask = tf.cast(tf.math.not_equal(x, self.padding_idx), dtype=tf.int32) incremental_indicies = tf.math.cumsum(mask, axis=1) * mask return incremental_indicies + self.padding_idx def create_position_ids_from_inputs_embeds(self, inputs_embeds): """ We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids. :param tf.Tensor inputs_embeds: :return tf.Tensor: """ seq_length = shape_list(inputs_embeds)[1] position_ids = tf.range(self.padding_idx + 1, seq_length + self.padding_idx + 1, dtype=tf.int32)[tf.newaxis, :] return position_ids def _embedding(self, inputs, training=False): """Applies embedding based on inputs tensor.""" input_ids, position_ids, token_type_ids, inputs_embeds = inputs if position_ids is None: if input_ids is not None: # Create the position ids from the input token ids. Any padded tokens remain padded. position_ids = self.create_position_ids_from_input_ids(input_ids) else: position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds) return super()._embedding([input_ids, position_ids, token_type_ids, inputs_embeds], training=training) class TFRobertaMainLayer(TFBertMainLayer): """ Same as TFBertMainLayer but uses TFRobertaEmbeddings. """ def __init__(self, config, **kwargs): super().__init__(config, **kwargs) self.embeddings = TFRobertaEmbeddings(config, name="embeddings") def get_input_embeddings(self): return self.embeddings class TFRobertaPreTrainedModel(TFPreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ config_class = RobertaConfig pretrained_model_archive_map = TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP base_model_prefix = "roberta" ROBERTA_START_DOCSTRING = r""" This model is a `tf.keras.Model `__ sub-class. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. .. note:: TF 2.0 models accepts two formats as inputs: - having all inputs as keyword arguments (like PyTorch models), or - having all inputs as a list, tuple or dict in the first positional arguments. This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having all the tensors in the first argument of the model call function: :obj:`model(inputs)`. If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument : - a single Tensor with input_ids only and nothing else: :obj:`model(inputs_ids)` - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])` - a dictionary with one or several input Tensors associated to the input names given in the docstring: :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})` Parameters: config (:class:`~transformers.RobertaConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. """ ROBERTA_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`): Indices of input sequence tokens in the vocabulary. Indices can be obtained using :class:`transformers.RobertaTokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and :func:`transformers.PreTrainedTokenizer.encode_plus` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. `What are attention masks? <../glossary.html#attention-mask>`__ token_type_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` corresponds to a `sentence B` token `What are token type IDs? <../glossary.html#token-type-ids>`__ position_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, config.max_position_embeddings - 1]``. `What are position IDs? <../glossary.html#position-ids>`__ head_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**. inputs_embeds (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, embedding_dim)`, `optional`, defaults to :obj:`None`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. training (:obj:`boolean`, `optional`, defaults to :obj:`False`): Whether to activate dropout modules (if set to :obj:`True`) during training or to de-activate them (if set to :obj:`False`) for evaluation. """ @add_start_docstrings( "The bare RoBERTa Model transformer outputing raw hidden-states without any specific head on top.", ROBERTA_START_DOCSTRING, ) class TFRobertaModel(TFRobertaPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.roberta = TFRobertaMainLayer(config, name="roberta") @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING) def call(self, inputs, **kwargs): r""" Returns: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs: last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the output of the last layer of the model. pooler_output (:obj:`tf.Tensor` of shape :obj:`(batch_size, hidden_size)`): Last layer hidden-state of the first token of the sequence (classification token) further processed by a Linear layer and a Tanh activation function. The Linear layer weights are trained from the next sentence prediction (classification) objective during Bert pretraining. This output is usually *not* a good summary of the semantic content of the input, you're often better with averaging or pooling the sequence of hidden-states for the whole input sequence. hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers import RobertaTokenizer, TFRobertaModel tokenizer = RobertaTokenizer.from_pretrained('roberta-base') model = TFRobertaModel.from_pretrained('roberta-base') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ outputs = self.roberta(inputs, **kwargs) return outputs class TFRobertaLMHead(tf.keras.layers.Layer): """Roberta Head for masked language modeling.""" def __init__(self, config, input_embeddings, **kwargs): super().__init__(**kwargs) self.vocab_size = config.vocab_size self.dense = tf.keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") self.act = tf.keras.layers.Activation(gelu) # The output weights are the same as the input embeddings, but there is # an output-only bias for each token. self.decoder = input_embeddings def build(self, input_shape): self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") super().build(input_shape) def call(self, features): x = self.dense(features) x = self.act(x) x = self.layer_norm(x) # project back to size of vocabulary with bias x = self.decoder(x, mode="linear") + self.bias return x @add_start_docstrings("""RoBERTa Model with a `language modeling` head on top. """, ROBERTA_START_DOCSTRING) class TFRobertaForMaskedLM(TFRobertaPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.roberta = TFRobertaMainLayer(config, name="roberta") self.lm_head = TFRobertaLMHead(config, self.roberta.embeddings, name="lm_head") def get_output_embeddings(self): return self.lm_head.decoder @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING) def call(self, inputs, **kwargs): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs: prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers import RobertaTokenizer, TFRobertaForMaskedLM tokenizer = RobertaTokenizer.from_pretrained('roberta-base') model = TFRobertaForMaskedLM.from_pretrained('roberta-base') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) prediction_scores = outputs[0] """ outputs = self.roberta(inputs, **kwargs) sequence_output = outputs[0] prediction_scores = self.lm_head(sequence_output) outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here return outputs # prediction_scores, (hidden_states), (attentions) class TFRobertaClassificationHead(tf.keras.layers.Layer): """Head for sentence-level classification tasks.""" def __init__(self, config, **kwargs): super().__init__(config, **kwargs) self.dense = tf.keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), activation="tanh", name="dense", ) self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) self.out_proj = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj" ) def call(self, features, training=False): x = features[:, 0, :] # take token (equiv. to [CLS]) x = self.dropout(x, training=training) x = self.dense(x) x = self.dropout(x, training=training) x = self.out_proj(x) return x @add_start_docstrings( """RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, ROBERTA_START_DOCSTRING, ) class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels self.roberta = TFRobertaMainLayer(config, name="roberta") self.classifier = TFRobertaClassificationHead(config, name="classifier") @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING) def call(self, inputs, **kwargs): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs: logits (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`): Classification (or regression if config.num_labels==1) scores (before SoftMax). hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers import RobertaTokenizer, TFRobertaForSequenceClassification tokenizer = RobertaTokenizer.from_pretrained('roberta-base') model = TFRobertaForSequenceClassification.from_pretrained('roberta-base') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 labels = tf.constant([1])[None, :] # Batch size 1 outputs = model(input_ids) logits = outputs[0] """ outputs = self.roberta(inputs, **kwargs) sequence_output = outputs[0] logits = self.classifier(sequence_output, training=kwargs.get("training", False)) outputs = (logits,) + outputs[2:] return outputs # logits, (hidden_states), (attentions) @add_start_docstrings( """RoBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, ROBERTA_START_DOCSTRING, ) class TFRobertaForTokenClassification(TFRobertaPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels self.roberta = TFRobertaMainLayer(config, name="roberta") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) self.classifier = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING) def call(self, inputs, **kwargs): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs: scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`): Classification scores (before SoftMax). hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import tensorflow as tf from transformers import RobertaTokenizer, TFRobertaForTokenClassification tokenizer = RobertaTokenizer.from_pretrained('roberta-base') model = TFRobertaForTokenClassification.from_pretrained('roberta-base') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) scores = outputs[0] """ outputs = self.roberta(inputs, **kwargs) sequence_output = outputs[0] sequence_output = self.dropout(sequence_output, training=kwargs.get("training", False)) logits = self.classifier(sequence_output) outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here return outputs # scores, (hidden_states), (attentions) @add_start_docstrings( """RoBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, ROBERTA_START_DOCSTRING, ) class TFRobertaForQuestionAnswering(TFRobertaPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels self.roberta = TFRobertaMainLayer(config, name="roberta") self.qa_outputs = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" ) @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING) def call(self, inputs, **kwargs): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs: start_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`): Span-start scores (before SoftMax). end_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`): Span-end scores (before SoftMax). hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: # The checkpoint roberta-base is not fine-tuned for question answering. Please see the # examples/question-answering/run_squad.py example to see how to fine-tune a model to a question answering task. import tensorflow as tf from transformers import RobertaTokenizer, TFRobertaForQuestionAnswering tokenizer = RobertaTokenizer.from_pretrained('roberta-base') model = TFRobertaForQuestionAnswering.from_pretrained('roberta-base') input_ids = tokenizer.encode("Who was Jim Henson?", "Jim Henson was a nice puppet") start_scores, end_scores = model(tf.constant(input_ids)[None, :]) # Batch size 1 all_tokens = tokenizer.convert_ids_to_tokens(input_ids) answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1]) """ outputs = self.roberta(inputs, **kwargs) sequence_output = outputs[0] logits = self.qa_outputs(sequence_output) start_logits, end_logits = tf.split(logits, 2, axis=-1) start_logits = tf.squeeze(start_logits, axis=-1) end_logits = tf.squeeze(end_logits, axis=-1) outputs = (start_logits, end_logits,) + outputs[2:] return outputs # start_logits, end_logits, (hidden_states), (attentions)