U
    &ºco  ã                   @   sB  d dl Z d dlZd dlZd dlmZ ddlmZ ddlmZ ddl	m
Z
mZ ddlmZmZmZmZ e  e¡Zddd	d
dddœZd'dd„ZG dd„ deƒZG dd„ dejƒZG dd„ dejƒZG dd„ deƒZdZdZe
deƒG dd„ deƒƒZe
deƒG dd „ d eƒƒZe
d!eƒG d"d#„ d#eƒƒZe
d$eƒG d%d&„ d&eƒƒZ dS )(é    Né   )Úget_activation)ÚElectraConfig)Úadd_start_docstringsÚ add_start_docstrings_to_callable)ÚBertEmbeddingsÚBertEncoderÚBertLayerNormÚBertPreTrainedModelzKhttps://cdn.huggingface.co/google/electra-small-generator/pytorch_model.binzJhttps://cdn.huggingface.co/google/electra-base-generator/pytorch_model.binzKhttps://cdn.huggingface.co/google/electra-large-generator/pytorch_model.binzOhttps://cdn.huggingface.co/google/electra-small-discriminator/pytorch_model.binzNhttps://cdn.huggingface.co/google/electra-base-discriminator/pytorch_model.binzOhttps://cdn.huggingface.co/google/electra-large-discriminator/pytorch_model.bin)zgoogle/electra-small-generatorzgoogle/electra-base-generatorzgoogle/electra-large-generatorz"google/electra-small-discriminatorz!google/electra-base-discriminatorz"google/electra-large-discriminatorÚdiscriminatorc                 C   s  zddl }ddl}ddl}W n  tk
r<   t d¡ ‚ Y nX tj |¡}t 	d 
|¡¡ |j |¡}g }	g }
|D ]<\}}t 	d 
||¡¡ |j ||¡}|	 |¡ |
 |¡ qrt|	|
ƒD ]Z\}}|}zt| tƒrâ| dd¡}|dkr| d	d
¡}| dd	¡}| dd¡}| dd¡}| d¡}tdd„ |D ƒƒrNt 	d 
|¡¡ W qº| }|D ]Ü}| d|¡rv| d|¡}n|g}|d dks˜|d dkr¤t|dƒ}nj|d dksÀ|d dkrÌt|dƒ}nB|d dkræt|dƒ}n(|d dkr t|dƒ}nt||d ƒ}t|ƒdkrVt|d  ƒ}|| }qV| d!¡rLt|dƒ}n|dkr`| |¡}z|j|jksxt|ƒ‚W n< tk
r¶ } z| j|j|jf7  _‚ W 5 d}~X Y nX td" 
|¡|ƒ t  |¡|_!W qº t"k
r } ztd 
|¡||ƒ W Y ¢qºW 5 d}~X Y qºX qº| S )#z- Load tf checkpoints in a pytorch model.
    r   Nz™Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see https://www.tensorflow.org/install/ for installation instructions.z(Converting TensorFlow checkpoint from {}z"Loading TF weight {} with shape {}zelectra/embeddings/zgenerator/embeddings/Ú	generatorzelectra/zdiscriminator/z
generator/Zdense_1Údense_predictionz!generator_predictions/output_biaszgenerator_lm_head/biasú/c                 s   s   | ]}|d kV  qdS ))Zglobal_stepZtemperatureN© )Ú.0Únr   r   úA/tmp/pip-unpacked-wheel-ymerj3tt/transformers/modeling_electra.pyÚ	<genexpr>D   s     z-load_tf_weights_in_electra.<locals>.<genexpr>zSkipping {}z[A-Za-z]+_\d+z_(\d+)ÚkernelÚgammaZweightZoutput_biasÚbetaZbiasZoutput_weightsZsquadÚ
classifieré   r   Z_embeddingszInitialize PyTorch weight {})#ÚreZnumpyZ
tensorflowÚImportErrorÚloggerÚerrorÚosÚpathÚabspathÚinfoÚformatZtrainZlist_variablesZload_variableÚappendÚzipÚ
isinstanceÚElectraForMaskedLMÚreplaceÚsplitÚanyÚ	fullmatchÚgetattrÚlenÚintÚendswithZ	transposeÚshapeÚAssertionErrorÚargsÚprintÚtorchZ
from_numpyÚdataÚAttributeError)ÚmodelÚconfigZtf_checkpoint_pathZdiscriminator_or_generatorr   ÚnpÚtfZtf_pathZ	init_varsÚnamesZarraysÚnamer.   ÚarrayÚoriginal_nameÚpointerZm_nameZscope_namesÚnumÚer   r   r   Úload_tf_weights_in_electra   s‚    ÿ





r@   c                       s    e Zd ZdZ‡ fdd„Z‡  ZS )ÚElectraEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                    s^   t ƒ  |¡ tj|j|j|jd| _t |j|j¡| _	t |j
|j¡| _t|j|jd| _d S )N)Zpadding_idx)Zeps)ÚsuperÚ__init__ÚnnZ	EmbeddingÚ
vocab_sizeÚembedding_sizeZpad_token_idÚword_embeddingsZmax_position_embeddingsZposition_embeddingsZtype_vocab_sizeZtoken_type_embeddingsr	   Zlayer_norm_epsÚ	LayerNorm©Úselfr6   ©Ú	__class__r   r   rC   n   s
    zElectraEmbeddings.__init__)Ú__name__Ú
__module__Ú__qualname__Ú__doc__rC   Ú__classcell__r   r   rK   r   rA   k   s   rA   c                       s(   e Zd ZdZ‡ fdd„Zdd„ Z‡  ZS )ÚElectraDiscriminatorPredictionszEPrediction module for the discriminator, made up of two dense layers.c                    s6   t ƒ  ¡  t |j|j¡| _t |jd¡| _|| _d S )Nr   )rB   rC   rD   ÚLinearÚhidden_sizeÚdenser   r6   rI   rK   r   r   rC   |   s    
z(ElectraDiscriminatorPredictions.__init__c                 C   s,   |   |¡}t| jjƒ|ƒ}|  |¡ ¡ }|S ©N)rU   r   r6   Z
hidden_actr   Zsqueeze)rJ   Údiscriminator_hidden_statesÚattention_maskÚhidden_statesÚlogitsr   r   r   Úforwardƒ   s    
z'ElectraDiscriminatorPredictions.forward©rM   rN   rO   rP   rC   r[   rQ   r   r   rK   r   rR   y   s   rR   c                       s(   e Zd ZdZ‡ fdd„Zdd„ Z‡  ZS )ÚElectraGeneratorPredictionszAPrediction module for the generator, made up of two dense layers.c                    s,   t ƒ  ¡  t|jƒ| _t |j|j¡| _d S rV   )	rB   rC   r	   rF   rH   rD   rS   rT   rU   rI   rK   r   r   rC   Ž   s    
z$ElectraGeneratorPredictions.__init__c                 C   s$   |   |¡}tdƒ|ƒ}|  |¡}|S )NZgelu)rU   r   rH   )rJ   Úgenerator_hidden_statesrY   r   r   r   r[   ”   s    

z#ElectraGeneratorPredictions.forwardr\   r   r   rK   r   r]   ‹   s   r]   c                   @   s    e Zd ZdZeZeZeZ	dZ
dS )ÚElectraPreTrainedModelz† An abstract class to handle weights initialization and
        a simple interface for downloading and loading pretrained models.
    ÚelectraN)rM   rN   rO   rP   r   Úconfig_classÚ$ELECTRA_PRETRAINED_MODEL_ARCHIVE_MAPZpretrained_model_archive_mapr@   Zload_tf_weightsZbase_model_prefixr   r   r   r   r_   œ   s
   r_   ap  
    This model is a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`_ sub-class.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general
    usage and behavior.

    Parameters:
        config (:class:`~transformers.ElectraConfig`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the configuration.
            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
aÊ  
    Args:
        input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using :class:`transformers.ElectraTokenizer`.
            See :func:`transformers.PreTrainedTokenizer.encode` and
            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.

            `What are input IDs? <../glossary.html#input-ids>`__
        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.

            `What are attention masks? <../glossary.html#attention-mask>`__
        token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Segment token indices to indicate first and second portions of the inputs.
            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
            corresponds to a `sentence B` token

            `What are token type IDs? <../glossary.html#token-type-ids>`_
        position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Selected in the range ``[0, config.max_position_embeddings - 1]``.

            `What are position IDs? <../glossary.html#position-ids>`_
        head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
            Mask to nullify selected heads of the self-attention modules.
            Mask values selected in ``[0, 1]``:
            :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
        inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
            than the model's internal embedding lookup matrix.
        encoder_hidden_states  (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
            if the model is configured as a decoder.
        encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask
            is used in the cross-attention if the model is configured as a decoder.
            Mask values selected in ``[0, 1]``:
            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
a\  The bare Electra Model transformer outputting raw hidden-states without any specific head on top. Identical to the BERT model except that it uses an additional linear layer between the embedding layer and the encoder if the hidden size and embedding size are different.Both the generator and discriminator checkpoints may be loaded into this model.c                       sJ   e Zd ZeZ‡ fdd„Zdd„ Zdd„ Zdd„ Ze	e
ƒdd
d„ƒZ‡  ZS )ÚElectraModelc                    sP   t ƒ  |¡ t|ƒ| _|j|jkr4t |j|j¡| _t	|ƒ| _
|| _|  ¡  d S rV   )rB   rC   rA   Ú
embeddingsrF   rT   rD   rS   Úembeddings_projectr   Úencoderr6   Úinit_weightsrI   rK   r   r   rC   ì   s    

zElectraModel.__init__c                 C   s   | j jS rV   ©rd   rG   ©rJ   r   r   r   Úget_input_embeddings÷   s    z!ElectraModel.get_input_embeddingsc                 C   s   || j _d S rV   rh   )rJ   Úvaluer   r   r   Úset_input_embeddingsú   s    z!ElectraModel.set_input_embeddingsc                 C   s*   |  ¡ D ]\}}| jj| j |¡ qdS )z¥ Prunes heads of the model.
            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
            See base class PreTrainedModel
        N)Úitemsrf   ÚlayerZ	attentionZprune_heads)rJ   Zheads_to_prunern   Zheadsr   r   r   Ú_prune_headsý   s    zElectraModel._prune_headsNc                 C   sê   |dk	r|dk	rt dƒ‚n4|dk	r,| ¡ }n"|dk	rF| ¡ dd… }nt dƒ‚|dk	r\|jn|j}|dkrxtj||d}|dkr’tj|tj|d}|  |||¡}	|  || j	j
¡}| j||||d}
t| dƒrÖ|  |
¡}
| j|
|	|d	}
|
S )
aÐ  
    Return:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs:
        last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        from transformers import ElectraModel, ElectraTokenizer
        import torch

        tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
        model = ElectraModel.from_pretrained('google/electra-small-discriminator')

        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids)

        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple

        NzDYou cannot specify both input_ids and inputs_embeds at the same timeéÿÿÿÿz5You have to specify either input_ids or inputs_embeds)Údevice)Zdtyperq   )Ú	input_idsÚposition_idsÚtoken_type_idsÚinputs_embedsre   )rX   Ú	head_mask)Ú
ValueErrorÚsizerq   r2   ZonesÚzerosÚlongZget_extended_attention_maskZget_head_maskr6   Znum_hidden_layersrd   Úhasattrre   rf   )rJ   rr   rX   rt   rs   rv   ru   Zinput_shaperq   Zextended_attention_maskrY   r   r   r   r[     s0    )

   ÿ

zElectraModel.forward)NNNNNN)rM   rN   rO   r   ra   rC   rj   rl   ro   r   ÚELECTRA_INPUTS_DOCSTRINGr[   rQ   r   r   rK   r   rc   à   s   
      ùrc   zÊ
    Electra model with a binary classification head on top as used during pre-training for identifying generated
    tokens.

    It is recommended to load the discriminator checkpoint into that model.c                       s.   e Zd Z‡ fdd„Zeeƒddd„ƒZ‡  ZS )ÚElectraForPreTrainingc                    s,   t ƒ  |¡ t|ƒ| _t|ƒ| _|  ¡  d S rV   )rB   rC   rc   r`   rR   Údiscriminator_predictionsrg   rI   rK   r   r   rC   V  s    

zElectraForPreTraining.__init__Nc                 C   sÄ   |   ||||||¡}|d }	|  |	|¡}
|
f}|dk	r°t ¡ }|dk	rŠ| d|	jd ¡dk}|
 d|	jd ¡| }|| }||| ¡ ƒ}n||
 d|	jd ¡| ¡ ƒ}|f| }||dd… 7 }|S )aË  
        labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
            Labels for computing the ELECTRA loss. Input should be a sequence of tokens (see :obj:`input_ids` docstring)
            Indices should be in ``[0, 1]``.
            ``0`` indicates the token is an original token,
            ``1`` indicates the token was replaced.

    Returns:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs:
        loss (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
            Total loss of the ELECTRA objective.
        scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`)
            Prediction scores of the head (scores for each token before SoftMax).
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.


    Examples::

        from transformers import ElectraTokenizer, ElectraForPreTraining
        import torch

        tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
        model = ElectraForPreTraining.from_pretrained('google/electra-small-discriminator')

        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids)

        prediction_scores, seq_relationship_scores = outputs[:2]

        r   Nrp   r   )r`   r~   rD   ZBCEWithLogitsLossÚviewr.   Úfloat©rJ   rr   rX   rt   rs   rv   ru   ÚlabelsrW   Zdiscriminator_sequence_outputrZ   ÚoutputÚloss_fctZactive_lossZactive_logitsZactive_labelsÚlossr   r   r   r[   ]  s,    4     ÿ
zElectraForPreTraining.forward)NNNNNNN©rM   rN   rO   rC   r   r|   r[   rQ   r   r   rK   r   r}   M  s   	       ør}   zø
    Electra model with a language modeling head on top.

    Even though both the discriminator and generator may be loaded into this model, the generator is
    the only model of the two to have been trained for the masked language modeling task.c                       s6   e Zd Z‡ fdd„Zdd„ Zeeƒddd„ƒZ‡  ZS )	r%   c                    s>   t ƒ  |¡ t|ƒ| _t|ƒ| _t |j|j	¡| _
|  ¡  d S rV   )rB   rC   rc   r`   r]   Úgenerator_predictionsrD   rS   rF   rE   Úgenerator_lm_headrg   rI   rK   r   r   rC   ´  s
    

zElectraForMaskedLM.__init__c                 C   s   | j S rV   )rˆ   ri   r   r   r   Úget_output_embeddings½  s    z(ElectraForMaskedLM.get_output_embeddingsNc                 C   s€   |   ||||||¡}|d }	|  |	¡}
|  |
¡}
|
f}|dk	rlt ¡ }||
 d| jj¡| d¡ƒ}|f| }||dd… 7 }|S )ax	  
        masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Labels for computing the masked language modeling loss.
            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
            in ``[0, ..., config.vocab_size]``

    Returns:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs:
        masked_lm_loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
            Masked language modeling loss.
        prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

        Examples::

            from transformers import ElectraTokenizer, ElectraForMaskedLM
            import torch

            tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-generator')
            model = ElectraForMaskedLM.from_pretrained('google/electra-small-generator')

            input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
            outputs = model(input_ids, masked_lm_labels=input_ids)

            loss, prediction_scores = outputs[:2]

        r   Nrp   r   )r`   r‡   rˆ   rD   ÚCrossEntropyLossr   r6   rE   )rJ   rr   rX   rt   rs   rv   ru   Zmasked_lm_labelsr^   Zgenerator_sequence_outputZprediction_scoresrƒ   r„   r…   r   r   r   r[   À  s$    3     ÿ


zElectraForMaskedLM.forward)NNNNNNN)	rM   rN   rO   rC   r‰   r   r|   r[   rQ   r   r   rK   r   r%   «  s   		       ør%   z„
    Electra model with a token classification head on top.

    Both the discriminator and generator may be loaded into this model.c                       s.   e Zd Z‡ fdd„Zeeƒddd„ƒZ‡  ZS )ÚElectraForTokenClassificationc                    sB   t ƒ  |¡ t|ƒ| _t |j¡| _t |j	|j
¡| _|  ¡  d S rV   )rB   rC   rc   r`   rD   ZDropoutZhidden_dropout_probÚdropoutrS   rT   Ú
num_labelsr   rg   rI   rK   r   r   rC     s
    
z&ElectraForTokenClassification.__init__Nc                 C   sÄ   |   ||||||¡}|d }	|  |	¡}	|  |	¡}
|
f}|dk	r°t ¡ }|dk	rŠ| d¡dk}|
 d| jj¡| }| d¡| }|||ƒ}n||
 d| jj¡| d¡ƒ}|f| }||dd… 7 }|S )a  
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Labels for computing the token classification loss.
            Indices should be in ``[0, ..., config.num_labels - 1]``.

    Returns:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs:
        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) :
            Classification loss.
        scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`)
            Classification scores (before SoftMax).
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        from transformers import ElectraTokenizer, ElectraForTokenClassification
        import torch

        tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
        model = ElectraForTokenClassification.from_pretrained('google/electra-small-discriminator')

        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids, labels=labels)

        loss, scores = outputs[:2]

        r   Nrp   r   )r`   rŒ   r   rD   rŠ   r   r6   r   r   r   r   r   r[     s.    2     ÿ


z%ElectraForTokenClassification.forward)NNNNNNNr†   r   r   rK   r   r‹     s          ør‹   )r   )!Úloggingr   r2   Ztorch.nnrD   Zactivationsr   Zconfiguration_electrar   Z
file_utilsr   r   Zmodeling_bertr   r   r	   r
   Ú	getLoggerrM   r   rb   r@   rA   ÚModulerR   r]   r_   ZELECTRA_START_DOCSTRINGr|   rc   r}   r%   r‹   r   r   r   r   Ú<module>   sT   
ú

Q.úeúVúUû