U
    &co                     @   sd  d dl Z d dlZd dlmZ ddlmZmZ ddlm	Z	m
Z
mZ ddlmZmZ ddlmZ e eZdd	d
ddddZG dd dejjjZG dd dejjjZG dd dejjjZG dd deZG dd deZdZdZedeG dd deZedeG dd  d eZ G d!d" d"ejjjZ!ed#eG d$d% d%eZ"ed&eG d'd( d(eZ#dS ))    N)ElectraConfig   )add_start_docstrings add_start_docstrings_to_callable)ACT2FNTFBertEncoderTFBertPreTrainedModel)get_initializer
shape_list)BatchEncodingzEhttps://cdn.huggingface.co/google/electra-small-generator/tf_model.h5zDhttps://cdn.huggingface.co/google/electra-base-generator/tf_model.h5zEhttps://cdn.huggingface.co/google/electra-large-generator/tf_model.h5zIhttps://cdn.huggingface.co/google/electra-small-discriminator/tf_model.h5zHhttps://cdn.huggingface.co/google/electra-base-discriminator/tf_model.h5zIhttps://cdn.huggingface.co/google/electra-large-discriminator/tf_model.h5)zgoogle/electra-small-generatorzgoogle/electra-base-generatorzgoogle/electra-large-generatorz"google/electra-small-discriminatorz!google/electra-base-discriminatorz"google/electra-large-discriminatorc                       sH   e Zd ZdZ fddZ fddZddd	Zdd
dZdd Z  Z	S )TFElectraEmbeddingszLConstruct the embeddings from word, position and token_type embeddings.
    c                    s   t  jf | |j| _|j| _|j| _tjjj|j	|jt
| jdd| _tjjj|j|jt
| jdd| _tjjj|jdd| _tjj|j| _d S )Nposition_embeddings)Zembeddings_initializernametoken_type_embeddings	LayerNormepsilonr   )super__init__
vocab_sizeembedding_sizeinitializer_rangetfkeraslayersZ	EmbeddingZmax_position_embeddingsr	   r   Ztype_vocab_sizer   LayerNormalizationlayer_norm_epsr   Dropouthidden_dropout_probdropoutselfconfigkwargs	__class__ D/tmp/pip-unpacked-wheel-ymerj3tt/transformers/modeling_tf_electra.pyr      s$    	zTFElectraEmbeddings.__init__c              	      sF   t d& | jd| j| jgt| jd| _W 5 Q R X t 	| dS )z"Build shared word embedding layer word_embeddingsZweight)shapeinitializerN)
r   Z
name_scope
add_weightr   r   r	   r   r(   r   buildr!   input_shaper$   r&   r'   r,   6   s    
zTFElectraEmbeddings.build	embeddingFc                 C   s:   |dkr| j ||dS |dkr(| |S td|dS )a	  Get token embeddings of inputs.
        Args:
            inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids)
            mode: string, a valid value is one of "embedding" and "linear".
        Returns:
            outputs: (1) If mode == "embedding", output embedding tensor, float32 with
                shape [batch_size, length, embedding_size]; (2) mode == "linear", output
                linear tensor, float32 with shape [batch_size, length, vocab_size].
        Raises:
            ValueError: if mode is not valid.

        Shared weights logic adapted from
            https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
        r/   traininglinearzmode {} is not valid.N)
_embedding_linear
ValueErrorformat)r!   inputsmoder1   r&   r&   r'   callB   s
    
zTFElectraEmbeddings.callc                 C   s   |\}}}}|dk	rt |}nt |dd }|d }|dkr\tj|tjdtjddf }|dkrpt|d}|dkrt| j|}| |}	| 	|}
||	 |
 }| 
|}| j||d}|S )z)Applies embedding based on inputs tensor.Nr   )Zdtyper   r0   )r
   r   rangeZint32newaxisfillZgatherr(   r   r   r   r   )r!   r7   r1   	input_idsposition_idstoken_type_idsinputs_embedsr.   Z
seq_lengthr   r   
embeddingsr&   r&   r'   r3   X   s"    



zTFElectraEmbeddings._embeddingc                 C   sP   t |d }t |d }t|d| jg}tj|| jdd}t|||| jgS )a  Computes logits by running inputs through a linear layer.
            Args:
                inputs: A float32 tensor with shape [batch_size, length, hidden_size]
            Returns:
                float32 tensor with shape [batch_size, length, vocab_size].
        r   r   r:   T)Ztranspose_b)r
   r   Zreshaper   matmulr(   r   )r!   r7   Z
batch_sizelengthxlogitsr&   r&   r'   r4   q   s
    zTFElectraEmbeddings._linear)r/   F)F)
__name__
__module____qualname____doc__r   r,   r9   r3   r4   __classcell__r&   r&   r$   r'   r      s   

r   c                       s&   e Zd Z fddZdddZ  ZS )!TFElectraDiscriminatorPredictionsc                    sB   t  jf | tjjj|jdd| _tjjjddd| _|| _	d S )Ndenser   r   dense_prediction)
r   r   r   r   r   Densehidden_sizerM   rO   r"   r    r$   r&   r'   r      s    z*TFElectraDiscriminatorPredictions.__init__Fc                 C   s.   |  |}t| jj |}t| |}|S N)rM   r   r"   
hidden_actr   ZsqueezerO   )r!   discriminator_hidden_statesr1   hidden_statesrF   r&   r&   r'   r9      s    
z&TFElectraDiscriminatorPredictions.call)FrG   rH   rI   r   r9   rK   r&   r&   r$   r'   rL      s   rL   c                       s&   e Zd Z fddZdddZ  ZS )TFElectraGeneratorPredictionsc                    s>   t  jf | tjjj|jdd| _tjjj|j	dd| _
d S )Nr   r   rM   rN   )r   r   r   r   r   r   r   r   rP   r   rM   r    r$   r&   r'   r      s    z&TFElectraGeneratorPredictions.__init__Fc                 C   s$   |  |}td |}| |}|S )NZgelu)rM   r   r   )r!   generator_hidden_statesr1   rU   r&   r&   r'   r9      s    

z"TFElectraGeneratorPredictions.call)FrV   r&   r&   r$   r'   rW      s   rW   c                   @   s(   e Zd ZeZeZdZdd Zdd Z	dS )TFElectraPreTrainedModelelectrac                 C   sN   |d krt |d}|d d t jt jd d f }t |t j}d| d }|S )Nr   g      ?g     )r   r=   r<   castZfloat32)r!   attention_maskr.   extended_attention_maskr&   r&   r'   get_extended_attention_mask   s    z4TFElectraPreTrainedModel.get_extended_attention_maskc                 C   s    |d k	rt nd g| jj }|S rR   )NotImplementedErrorr"   Znum_hidden_layers)r!   	head_maskr&   r&   r'   get_head_mask   s    z&TFElectraPreTrainedModel.get_head_maskN)
rG   rH   rI   r   config_class'TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_MAPZpretrained_model_archive_mapZbase_model_prefixr^   ra   r&   r&   r&   r'   rY      s
   rY   c                       sB   e Zd ZeZ fddZdd Zdd Zdd ZdddZ	  Z
S )TFElectraMainLayerc                    sX   t  j|f| t|dd| _|j|jkr@tjjj	|jdd| _
t|dd| _|| _d S )NrB   rN   embeddings_projectencoder)r   r   r   rB   r   rQ   r   r   r   rP   re   r   rf   r"   r    r$   r&   r'   r      s    zTFElectraMainLayer.__init__c                 C   s   | j S rR   )rB   r!   r&   r&   r'   get_input_embeddings   s    z'TFElectraMainLayer.get_input_embeddingsc                 C   s   t d S rR   r_   )r!   Znew_num_tokensr&   r&   r'   _resize_token_embeddings   s    z+TFElectraMainLayer._resize_token_embeddingsc                 C   s   t dS )z Prunes heads of the model.
            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
            See base class PreTrainedModel
        Nri   )r!   Zheads_to_pruner&   r&   r'   _prune_heads   s    zTFElectraMainLayer._prune_headsNFc                 C   s  t |ttfr|d }t|dkr*|d n|}t|dkrB|d n|}t|dkrZ|d n|}t|dkrr|d n|}t|dkr|d n|}t|dkstdnrt |ttfr|d	}|d
|}|d|}|d|}|d|}|d|}t|dkstdn|}|d k	r4|d k	r4tdn8|d k	rHt	|}	n$|d k	rdt	|d d }	ntd|d krt
|	d}|d krt
|	d}| ||	}
| |}| j||||g|d}t| dr| j||d}| j||
|g|d}|S )Nr   r                  zToo many inputs.r>   r\   r@   r?   r`   rA   zDYou cannot specify both input_ids and inputs_embeds at the same timer:   z5You have to specify either input_ids or inputs_embedsr0   re   )
isinstancetuplelistlenAssertionErrordictr   getr5   r
   r   r=   r^   ra   rB   hasattrre   rf   )r!   r7   r\   r@   r?   r`   rA   r1   r>   r.   r]   rU   r&   r&   r'   r9      sF    








zTFElectraMainLayer.call)NNNNNF)rG   rH   rI   r   rb   r   rh   rj   rk   r9   rK   r&   r&   r$   r'   rd      s   	
      rd   a  
    This model is a `tf.keras.Model <https://www.tensorflow.org/api_docs/python/tf/keras/Model>`__ sub-class.
    Use it as a regular TF 2.0 Keras Model and
    refer to the TF 2.0 documentation for all matter related to general usage and behavior.

    .. note::

        TF 2.0 models accepts two formats as inputs:

            - having all inputs as keyword arguments (like PyTorch models), or
            - having all inputs as a list, tuple or dict in the first positional arguments.

        This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having
        all the tensors in the first argument of the model call function: :obj:`model(inputs)`.

        If you choose this second option, there are three possibilities you can use to gather all the input Tensors
        in the first positional argument :

        - a single Tensor with input_ids only and nothing else: :obj:`model(inputs_ids)`
        - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
          :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])`
        - a dictionary with one or several input Tensors associated to the input names given in the docstring:
          :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`

    Parameters:
        config (:class:`~transformers.ElectraConfig`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the configuration.
            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
a  
    Args:
        input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using :class:`transformers.ElectraTokenizer`.
            See :func:`transformers.PreTrainedTokenizer.encode` and
            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.

            `What are input IDs? <../glossary.html#input-ids>`__
        attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.

            `What are attention masks? <../glossary.html#attention-mask>`__
        head_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
            Mask to nullify selected heads of the self-attention modules.
            Mask values selected in ``[0, 1]``:
            :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
        inputs_embeds (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, embedding_dim)`, `optional`, defaults to :obj:`None`):
            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
            than the model's internal embedding lookup matrix.
        training (:obj:`boolean`, `optional`, defaults to :obj:`False`):
            Whether to activate dropout modules (if set to :obj:`True`) during training or to de-activate them
            (if set to :obj:`False`) for evaluation.

a\  The bare Electra Model transformer outputting raw hidden-states without any specific head on top. Identical to the BERT model except that it uses an additional linear layer between the embedding layer and the encoder if the hidden size and embedding size are different.Both the generator and discriminator checkpoints may be loaded into this model.c                       s4   e Zd Z fddZdd Zeedd Z  ZS )TFElectraModelc                    s&   t  j|f|| t|dd| _d S )NrZ   rN   )r   r   rd   rZ   )r!   r"   r7   r#   r$   r&   r'   r   ]  s    zTFElectraModel.__init__c                 C   s   | j jS rR   rZ   rB   rg   r&   r&   r'   rh   a  s    z#TFElectraModel.get_input_embeddingsc                 K   s   | j |f|}|S )a  
    Returns:
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs:
        last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
            tuple of :obj:`tf.Tensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import ElectraTokenizer, TFElectraModel

        tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
        model = TFElectraModel.from_pretrained('google/electra-small-discriminator')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
        outputs = model(input_ids)
        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
        )rZ   )r!   r7   r#   outputsr&   r&   r'   r9   d  s    zTFElectraModel.call	rG   rH   rI   r   rh   r   ELECTRA_INPUTS_DOCSTRINGr9   rK   r&   r&   r$   r'   ry   T  s   	ry   a8  
Electra model with a binary classification head on top as used during pre-training for identifying generated
tokens.

Even though both the discriminator and generator may be loaded into this model, the discriminator is
the only model of the two to have the correct classification head to be used for this model.c                       s6   e Zd Z fddZdd Zeed	ddZ  ZS )
TFElectraForPreTrainingc                    s0   t  j|f| t|dd| _t|dd| _d S )NrZ   rN   discriminator_predictions)r   r   rd   rZ   rL   r   r    r$   r&   r'   r     s    z TFElectraForPreTraining.__init__c                 C   s   | j jS rR   rz   rg   r&   r&   r'   rh     s    z,TFElectraForPreTraining.get_input_embeddingsNFc              	   C   sD   | j |||||||d}|d }	| |	}
|
f}||dd 7 }|S )aY  
    Returns:
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs:
        scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`):
            Prediction scores of the head (scores for each token before SoftMax).
        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
            tuple of :obj:`tf.Tensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import ElectraTokenizer, TFElectraForPreTraining

        tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
        model = TFElectraForPreTraining.from_pretrained('google/electra-small-discriminator')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
        outputs = model(input_ids)
        scores = outputs[0]
        r0   r   r   N)rZ   r   r!   r>   r\   r@   r?   r`   rA   r1   rT   Zdiscriminator_sequence_outputrF   outputr&   r&   r'   r9     s    '      
zTFElectraForPreTraining.call)NNNNNNFr|   r&   r&   r$   r'   r~     s   
       r~   c                       s2   e Zd Z fddZ fddZdddZ  ZS )	TFElectraMaskedLMHeadc                    s    t  jf | |j| _|| _d S rR   )r   r   r   input_embeddings)r!   r"   r   r#   r$   r&   r'   r     s    zTFElectraMaskedLMHead.__init__c                    s(   | j | jfdddd| _t | d S )NzerosTbias)r)   r*   Z	trainabler   )r+   r   r   r   r,   r-   r$   r&   r'   r,     s    zTFElectraMaskedLMHead.buildFc                 C   s   | j |dd}|| j }|S )Nr2   )r8   )r   r   )r!   rU   r1   r&   r&   r'   r9     s    
zTFElectraMaskedLMHead.call)F)rG   rH   rI   r   r,   r9   rK   r&   r&   r$   r'   r     s   r   z
Electra model with a language modeling head on top.

Even though both the discriminator and generator may be loaded into this model, the generator is
the only model of the two to have been trained for the masked language modeling task.c                       s>   e Zd Z fddZdd Zdd Zeedd	d
Z  Z	S )TFElectraForMaskedLMc                    sn   t  j|f| |j| _t|dd| _t|dd| _t|jt	rNt
|j | _n|j| _t|| jjdd| _d S )NrZ   rN   generator_predictionsgenerator_lm_head)r   r   r   rd   rZ   rW   r   rq   rS   strr   Z
activationr   rB   r   r    r$   r&   r'   r     s    zTFElectraForMaskedLM.__init__c                 C   s   | j jS rR   rz   rg   r&   r&   r'   rh     s    z)TFElectraForMaskedLM.get_input_embeddingsc                 C   s   | j S rR   )r   rg   r&   r&   r'   get_output_embeddings  s    z*TFElectraForMaskedLM.get_output_embeddingsNFc              	   C   sV   | j |||||||d}|d }	| j|	|d}
| j|
|d}
|
f}||dd 7 }|S )a  
    Returns:
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs:
        prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
            tuple of :obj:`tf.Tensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import ElectraTokenizer, TFElectraForMaskedLM

        tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-generator')
        model = TFElectraForMaskedLM.from_pretrained('google/electra-small-generator')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
        outputs = model(input_ids)
        prediction_scores = outputs[0]

        r0   r   r   N)rZ   r   r   )r!   r>   r\   r@   r?   r`   rA   r1   rX   Zgenerator_sequence_outputZprediction_scoresr   r&   r&   r'   r9     s    (      zTFElectraForMaskedLM.call)NNNNNNF)
rG   rH   rI   r   rh   r   r   r}   r9   rK   r&   r&   r$   r'   r     s   	       r   z|
Electra model with a token classification head on top.

Both the discriminator and generator may be loaded into this model.c                       s.   e Zd Z fddZeedddZ  ZS )TFElectraForTokenClassificationc                    sJ   t  j|f| t|dd| _tjj|j| _	tjjj
|jdd| _d S )NrZ   rN   
classifier)r   r   rd   rZ   r   r   r   r   r   r   rP   Z
num_labelsr   r    r$   r&   r'   r   1  s    z(TFElectraForTokenClassification.__init__NFc              	   C   sN   | j |||||||d}|d }	| |	}	| |	}
|
f}||dd 7 }|S )aK  
    Returns:
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs:
        scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`):
            Classification scores (before SoftMax).
        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
            tuple of :obj:`tf.Tensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import ElectraTokenizer, TFElectraForTokenClassification

        tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
        model = TFElectraForTokenClassification.from_pretrained('google/electra-small-discriminator')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
        outputs = model(input_ids)
        scores = outputs[0]
        r0   r   r   N)rZ   r   r   r   r&   r&   r'   r9   8  s    '      

z$TFElectraForTokenClassification.call)NNNNNNF)rG   rH   rI   r   r   r}   r9   rK   r&   r&   r$   r'   r   )  s          r   )$loggingZ
tensorflowr   Ztransformersr   Z
file_utilsr   r   Zmodeling_tf_bertr   r   r   Zmodeling_tf_utilsr	   r
   Ztokenization_utilsr   	getLoggerrG   loggerrc   r   r   ZLayerr   rL   rW   rY   rd   ZELECTRA_START_DOCSTRINGr}   ry   r~   r   r   r   r&   r&   r&   r'   <module>   sT   

g%R)	<G