U
    &ºc^Å  ã                	   @   sÚ  d Z ddlZddlZddlZddlZddlmZ ddlmZmZ ddl	m
Z
 ddlmZmZ ddlmZmZmZmZ ddlmZ e e¡Zd	d
dddddddœZdd„ ZG dd„ deƒZG dd„ deƒZG dd„ dejƒZG dd„ dejƒZG dd„ dejƒZG dd„ deƒZ d Z!d!Z"ed"e!ƒG d#d$„ d$e ƒƒZ#ed%e!ƒG d&d'„ d'e ƒƒZ$G d(d)„ d)ejƒZ%G d*d+„ d+ejƒZ&ed,e!ƒG d-d.„ d.e ƒƒZ'ed/e!ƒG d0d1„ d1e ƒƒZ(ed2e!ƒG d3d4„ d4e ƒƒZ)ed5e!ƒG d6d7„ d7e ƒƒZ*dS )8zPyTorch ALBERT model. é    N)ÚCrossEntropyLossÚMSELossé   )ÚAlbertConfig)Úadd_start_docstringsÚ add_start_docstrings_to_callable)ÚACT2FNÚBertEmbeddingsÚBertSelfAttentionÚprune_linear_layer)ÚPreTrainedModelz;https://cdn.huggingface.co/albert-base-v1-pytorch_model.binz<https://cdn.huggingface.co/albert-large-v1-pytorch_model.binz=https://cdn.huggingface.co/albert-xlarge-v1-pytorch_model.binz>https://cdn.huggingface.co/albert-xxlarge-v1-pytorch_model.binz;https://cdn.huggingface.co/albert-base-v2-pytorch_model.binz<https://cdn.huggingface.co/albert-large-v2-pytorch_model.binz=https://cdn.huggingface.co/albert-xlarge-v2-pytorch_model.binz>https://cdn.huggingface.co/albert-xxlarge-v2-pytorch_model.bin)zalbert-base-v1zalbert-large-v1zalbert-xlarge-v1zalbert-xxlarge-v1zalbert-base-v2zalbert-large-v2zalbert-xlarge-v2zalbert-xxlarge-v2c                 C   s*  zddl }ddl}ddl}W n  tk
r<   t d¡ ‚ Y nX tj |¡}t 	d 
|¡¡ |j |¡}g }g }	|D ]<\}
}t 	d 
|
|¡¡ |j ||
¡}| |
¡ |	 |¡ qrt||	ƒD ]\}
}t|
ƒ qºt||	ƒD ]L\}
}|
}|
 dd¡}
|
 dd	¡}
|
 d
d¡}
|
 dd¡}
|
 dd¡}
|
 dd¡}
|
 dd¡}
|
 dd¡}
|
 dd¡}
|
 dd¡}
|
 dd¡}
|
 dd¡}
|
 dd¡}
|
 dd¡}
|
 dd¡}
|
 dd ¡}
|
 d!d"¡}
|
 d#d$¡}
t|
 d¡ƒd%krìd&|
ksäd'|
krìd(|
 }
d)|
kr|
 d*d+¡}
|
 d,d-¡}
|
 d¡}
d.|
ksJd/|
ksJd0|
ksJd1|
ksJd2|
krbt 	d3 
d |
¡¡¡ qÖ| }|
D ]}| d4|¡rŒ| d5|¡}n|g}|d d6ks®|d d7krºt|d-ƒ}n¢|d d&ksÖ|d d8krât|d9ƒ}nz|d d'krüt|d-ƒ}n`|d d:krt|d;ƒ}nFzt||d ƒ}W n2 tk
rZ   t 	d3 
d |
¡¡¡ Y qjY nX t|ƒd<krjt|d% ƒ}|| }qj|d=d… d>kr t|d-ƒ}n|d6kr´| |¡}z|j|jksÈt‚W n< tk
r } z| j|j|jf7  _‚ W 5 d}~X Y nX td? 
|
|¡ƒ t |¡|_qÖ| S )@z( Load tf checkpoints in a pytorch model.r   Nz™Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see https://www.tensorflow.org/install/ for installation instructions.z(Converting TensorFlow checkpoint from {}z"Loading TF weight {} with shape {}zmodule/Ú Zffn_1Úffnzbert/zalbert/Zattention_1Ú	attentionz
transform/ZLayerNorm_1Úfull_layer_layer_normÚ	LayerNormzattention/LayerNormztransformer/zintermediate/dense/zffn/intermediate/output/dense/zffn_output/z/output/ú/z/self/zpooler/denseÚpoolerzcls/predictionsÚpredictionszpredictions/attentionzembeddings/attentionÚ
embeddingsZinner_group_zalbert_layers/Zgroup_zalbert_layer_groups/r   Zoutput_biasZoutput_weightszclassifier/Zseq_relationshipzseq_relationship/output_zsop_classifier/classifier/ÚweightsÚweightZadam_mZadam_vZAdamWeightDecayOptimizerZAdamWeightDecayOptimizer_1Zglobal_stepzSkipping {}z[A-Za-z]+_\d+z_(\d+)ÚkernelÚgammaÚbetaÚbiasZsquadÚ
classifieré   iõÿÿÿZ_embeddingsz$Initialize PyTorch weight {} from {}) ÚreZnumpyZ
tensorflowÚImportErrorÚloggerÚerrorÚosÚpathÚabspathÚinfoÚformatZtrainZlist_variablesZload_variableÚappendÚzipÚprintÚreplaceÚlenÚsplitÚjoinÚ	fullmatchÚgetattrÚAttributeErrorÚintÚ	transposeÚshapeÚAssertionErrorÚargsÚtorchZ
from_numpyÚdata)ÚmodelÚconfigZtf_checkpoint_pathr   ÚnpÚtfZtf_pathZ	init_varsÚnamesZarraysÚnamer3   ÚarrayÚoriginal_nameÚpointerZm_nameZscope_namesÚnumÚe© rC   ú@/tmp/pip-unpacked-wheel-ymerj3tt/transformers/modeling_albert.pyÚload_tf_weights_in_albert.   s¸    ÿ

(

ÿþýüû


rE   c                       s    e Zd ZdZ‡ fdd„Z‡  ZS )ÚAlbertEmbeddingszQ
    Construct the embeddings from word, position and token_type embeddings.
    c                    sb   t ƒ  |¡ tj|j|j|jd| _t |j|j¡| _	t |j
|j¡| _tjj|j|jd| _d S )N)Zpadding_idx©Zeps)ÚsuperÚ__init__ÚnnÚ	EmbeddingÚ
vocab_sizeÚembedding_sizeZpad_token_idÚword_embeddingsZmax_position_embeddingsZposition_embeddingsZtype_vocab_sizeZtoken_type_embeddingsr6   r   Úlayer_norm_eps©Úselfr9   ©Ú	__class__rC   rD   rI   ¯   s
    zAlbertEmbeddings.__init__)Ú__name__Ú
__module__Ú__qualname__Ú__doc__rI   Ú__classcell__rC   rC   rR   rD   rF   ª   s   rF   c                       s.   e Zd Z‡ fdd„Zdd„ Zddd„Z‡  ZS )	ÚAlbertAttentionc                    sr   t ƒ  |¡ |j| _|j| _|j| _|j|j | _t |j¡| _	t 
|j|j¡| _tj|j|jd| _tƒ | _d S ©NrG   )rH   rI   Úoutput_attentionsÚnum_attention_headsÚhidden_sizeÚattention_head_sizerJ   ÚDropoutZattention_probs_dropout_probÚdropoutÚLinearÚdenser   rO   ÚsetÚpruned_headsrP   rR   rC   rD   rI   ¹   s    zAlbertAttention.__init__c                    sò   t |ƒdkrd S t | j| j¡}t|ƒ| j }|D ](‰ ˆ t‡ fdd„| jD ƒƒ ‰ d|ˆ < q2| d¡ 	¡  
d¡}t t |ƒ¡|  ¡ }t| j|ƒ| _t| j|ƒ| _t| j|ƒ| _t| j|dd| _| jt |ƒ | _| j| j | _| j |¡| _d S )Nr   c                 3   s   | ]}|ˆ k rd ndV  qdS )r   r   NrC   )Ú.0Úh©ÚheadrC   rD   Ú	<genexpr>Ì   s     z.AlbertAttention.prune_heads.<locals>.<genexpr>éÿÿÿÿr   ©Zdim)r+   r6   Úonesr\   r^   rc   rd   ÚsumÚviewÚ
contiguousÚeqZarangeÚlongr   ÚqueryÚkeyÚvaluerb   Zall_head_sizeÚunion)rQ   ÚheadsÚmaskÚindexrC   rg   rD   Úprune_headsÅ   s     
zAlbertAttention.prune_headsNc                 C   s.  |   |¡}|  |¡}|  |¡}|  |¡}|  |¡}|  |¡}	t || dd¡¡}
|
t | j	¡ }
|d k	rp|
| }
t
jdd|
ƒ}|  |¡}|d k	rš|| }t ||	¡}| dddd¡ ¡ }| jj ¡  | j| j	| j¡ |j¡}| jj |j¡}t d||¡| }|  |¡}|  || ¡}| jr(||fS |fS )	Nrj   éþÿÿÿrk   r   r   r   é   zbfnd,ndh->bfh)rr   rs   rt   Ztranspose_for_scoresr6   Úmatmulr2   ÚmathÚsqrtr^   rJ   ZSoftmaxr`   Zpermutero   rb   r   Útrn   r\   r]   ÚtoÚdtyper   Zeinsumr   r[   )rQ   Ú	input_idsÚattention_maskÚ	head_maskZmixed_query_layerZmixed_key_layerZmixed_value_layerZquery_layerZ	key_layerZvalue_layerZattention_scoresZattention_probsZcontext_layerÚwÚbZprojected_context_layerZprojected_context_layer_dropoutZlayernormed_context_layerrC   rC   rD   ÚforwardÜ   s:    






  ÿþÿ
zAlbertAttention.forward)NN)rT   rU   rV   rI   ry   r‡   rX   rC   rC   rR   rD   rY   ¸   s   rY   c                       s&   e Zd Z‡ fdd„Zddd„Z‡  ZS )ÚAlbertLayerc                    sb   t ƒ  ¡  || _tj|j|jd| _t|ƒ| _	t 
|j|j¡| _t 
|j|j¡| _t|j | _d S rZ   )rH   rI   r9   rJ   r   r]   rO   r   rY   r   ra   Zintermediate_sizer   Ú
ffn_outputr   Ú
hidden_actÚ
activationrP   rR   rC   rD   rI   
  s    

zAlbertLayer.__init__Nc                 C   sT   |   |||¡}|  |d ¡}|  |¡}|  |¡}|  ||d  ¡}|f|dd …  S )Nr   r   )r   r   r‹   r‰   r   )rQ   Úhidden_statesrƒ   r„   Zattention_outputr‰   rC   rC   rD   r‡     s    

zAlbertLayer.forward)NN©rT   rU   rV   rI   r‡   rX   rC   rC   rR   rD   rˆ   	  s   
rˆ   c                       s&   e Zd Z‡ fdd„Zddd„Z‡  ZS )ÚAlbertLayerGroupc                    s>   t ƒ  ¡  ˆ j| _ˆ j| _t ‡ fdd„tˆ jƒD ƒ¡| _d S )Nc                    s   g | ]}t ˆ ƒ‘qS rC   )rˆ   ©re   Ú_©r9   rC   rD   Ú
<listcomp>$  s     z-AlbertLayerGroup.__init__.<locals>.<listcomp>)	rH   rI   r[   Úoutput_hidden_statesrJ   Ú
ModuleListÚrangeÚinner_group_numÚalbert_layersrP   rR   r‘   rD   rI     s    
zAlbertLayerGroup.__init__Nc           
      C   s‚   d}d}t | jƒD ]D\}}||||| ƒ}|d }| jrF||d f }| jr||f }q|f}	| jrn|	|f }	| jr~|	|f }	|	S )NrC   r   r   )Ú	enumerater—   r[   r“   )
rQ   rŒ   rƒ   r„   Zlayer_hidden_statesZlayer_attentionsZlayer_indexZalbert_layerZlayer_outputÚoutputsrC   rC   rD   r‡   &  s    

zAlbertLayerGroup.forward)NNr   rC   rC   rR   rD   rŽ     s   rŽ   c                       s&   e Zd Z‡ fdd„Zddd„Z‡  ZS )ÚAlbertTransformerc                    sV   t ƒ  ¡  ˆ | _ˆ j| _ˆ j| _t ˆ jˆ j¡| _	t 
‡ fdd„tˆ jƒD ƒ¡| _d S )Nc                    s   g | ]}t ˆ ƒ‘qS rC   )rŽ   r   r‘   rC   rD   r’   D  s     z.AlbertTransformer.__init__.<locals>.<listcomp>)rH   rI   r9   r[   r“   rJ   ra   rM   r]   Úembedding_hidden_mapping_inr”   r•   Únum_hidden_groupsÚalbert_layer_groupsrP   rR   r‘   rD   rI   =  s    
zAlbertTransformer.__init__Nc                 C   sÒ   |   |¡}d}| jr|f}t| jjƒD ]€}t| jj| jj ƒ}t|| jj| jj  ƒ}| j| ||||| |d | … ƒ}	|	d }| jr–||	d  }| jr&||f }q&|f}
| jr¾|
|f }
| jrÎ|
|f }
|
S )NrC   r   r   rj   )	r›   r“   r•   r9   Únum_hidden_layersr1   rœ   r   r[   )rQ   rŒ   rƒ   r„   Zall_attentionsZall_hidden_statesÚiZlayers_per_groupÚ	group_idxZlayer_group_outputr™   rC   rC   rD   r‡   F  s.    
ý

zAlbertTransformer.forward)NNr   rC   rC   rR   rD   rš   <  s   	rš   c                   @   s$   e Zd ZdZeZeZdZdd„ Z	dS )ÚAlbertPreTrainedModelz† An abstract class to handle weights initialization and
        a simple interface for downloading and loading pretrained models.
    Úalbertc                 C   sv   t |tjtjfƒrL|jjjd| jjd t |tjƒrr|j	dk	rr|j	j 
¡  n&t |tjƒrr|j	j 
¡  |jj d¡ dS )z! Initialize the weights.
        g        )ZmeanZstdNç      ð?)Ú
isinstancerJ   ra   rK   r   r7   Znormal_r9   Zinitializer_ranger   Zzero_r   Zfill_)rQ   ÚmodulerC   rC   rD   Ú_init_weightss  s    z#AlbertPreTrainedModel._init_weightsN)
rT   rU   rV   rW   r   Úconfig_classÚ#ALBERT_PRETRAINED_MODEL_ARCHIVE_MAPÚpretrained_model_archive_mapÚbase_model_prefixr¦   rC   rC   rC   rD   r¡   j  s
   r¡   aj  

    This model is a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`_ sub-class.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general
    usage and behavior.

    Args:
        config (:class:`~transformers.AlbertConfig`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the configuration.
            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
a½	  
    Args:
        input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using :class:`transformers.AlbertTokenizer`.
            See :func:`transformers.PreTrainedTokenizer.encode` and
            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.

            `What are input IDs? <../glossary.html#input-ids>`__
        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.

            `What are attention masks? <../glossary.html#attention-mask>`__
        token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Segment token indices to indicate first and second portions of the inputs.
            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
            corresponds to a `sentence B` token

            `What are token type IDs? <../glossary.html#token-type-ids>`_
        position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Selected in the range ``[0, config.max_position_embeddings - 1]``.

            `What are position IDs? <../glossary.html#position-ids>`_
        head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
            Mask to nullify selected heads of the self-attention modules.
            Mask values selected in ``[0, 1]``:
            :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
        input_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
            than the model's internal embedding lookup matrix.
z`The bare ALBERT Model transformer outputting raw hidden-states without any specific head on top.c                       s^   e Zd ZeZeZeZdZ	‡ fdd„Z
dd„ Zdd„ Zdd	„ Zd
d„ Zeeƒddd„ƒZ‡  ZS )ÚAlbertModelr¢   c                    sN   t ƒ  |¡ || _t|ƒ| _t|ƒ| _t |j	|j	¡| _
t ¡ | _|  ¡  d S ©N)rH   rI   r9   rF   r   rš   ÚencoderrJ   ra   r]   r   ZTanhÚpooler_activationÚinit_weightsrP   rR   rC   rD   rI   ¾  s    


zAlbertModel.__init__c                 C   s   | j jS r¬   ©r   rN   ©rQ   rC   rC   rD   Úget_input_embeddingsÉ  s    z AlbertModel.get_input_embeddingsc                 C   s   || j _d S r¬   r°   )rQ   rt   rC   rC   rD   Úset_input_embeddingsÌ  s    z AlbertModel.set_input_embeddingsc                 C   s$   | j j}|  ||¡}|| j _| j jS r¬   )r   rN   Z_get_resized_embeddings)rQ   Znew_num_tokensZold_embeddingsZnew_embeddingsrC   rC   rD   Ú_resize_token_embeddingsÏ  s    z$AlbertModel._resize_token_embeddingsc                 C   sT   |  ¡ D ]F\}}t|| jj ƒ}t||| jj  ƒ}| jj| j| j |¡ qdS )aþ   Prunes heads of the model.
            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
            ALBERT has a different architecture in that its layers are shared across groups, which then has inner groups.
            If an ALBERT model has 12 hidden layers and 2 hidden groups, with two inner groups, there
            is a total of 4 different layers.

            These layers are flattened: the indices [0,1] correspond to the two inner groups of the first hidden layer,
            while [2,3] correspond to the two inner groups of the second hidden layer.

            Any layer with in index other than [0,1,2,3] will result in an error.
            See base class PreTrainedModel for more information about head pruning
        N)	Úitemsr1   r9   r–   r­   r   r—   r   ry   )rQ   Zheads_to_pruneZlayerrv   r    Zinner_group_idxrC   rC   rD   Ú_prune_headsÕ  s    zAlbertModel._prune_headsNc                 C   s2  |dk	r|dk	rt dƒ‚n4|dk	r,| ¡ }n"|dk	rF| ¡ dd… }nt dƒ‚|dk	r\|jn|j}|dkrxtj||d}|dkr’tj|tj|d}| d¡ d¡}	|	jt	|  
¡ ƒjd	}	d
|	 d }	|  || jj¡}| j||||d}
| j|
|	|d}|d }|  |  |dd…df ¡¡}||f|dd…  }|S )a	  
    Return:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs:
        last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        pooler_output (:obj:`torch.FloatTensor`: of shape :obj:`(batch_size, hidden_size)`):
            Last layer hidden-state of the first token of the sequence (classification token)
            further processed by a Linear layer and a Tanh activation function. The Linear
            layer weights are trained from the next sentence prediction (classification)
            objective during pre-training.

            This output is usually *not* a good summary
            of the semantic content of the input, you're often better with averaging or pooling
            the sequence of hidden-states for the whole input sequence.
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Example::

        from transformers import AlbertModel, AlbertTokenizer
        import torch

        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
        model = AlbertModel.from_pretrained('albert-base-v2')
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids)
        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple

        NzDYou cannot specify both input_ids and inputs_embeds at the same timerj   z5You have to specify either input_ids or inputs_embeds)Údevice)r   r·   r   r   )r   r£   g     ˆÃÀ)Úposition_idsÚtoken_type_idsÚinputs_embeds)r„   r   )Ú
ValueErrorÚsizer·   r6   rl   Úzerosrq   Z	unsqueezer€   ÚnextÚ
parametersr   Zget_head_maskr9   rž   r   r­   r®   r   )rQ   r‚   rƒ   r¹   r¸   r„   rº   Zinput_shaper·   Zextended_attention_maskZembedding_outputZencoder_outputsÚsequence_outputÚpooled_outputr™   rC   rC   rD   r‡   ç  s:    1

   ÿÿ
zAlbertModel.forward)NNNNNN)rT   rU   rV   r   r§   r¨   r©   rE   Zload_tf_weightsrª   rI   r²   r³   r´   r¶   r   ÚALBERT_INPUTS_DOCSTRINGr‡   rX   rC   rC   rR   rD   r«   ³  s"         ùr«   z£Albert Model with two heads on top as done during the pre-training: a `masked language modeling` head and
    a `sentence order prediction (classification)` head. c                	       s>   e Zd Z‡ fdd„Zdd„ Zdd„ Zeeƒd
dd	„ƒZ‡  Z	S )ÚAlbertForPreTrainingc                    s>   t ƒ  |¡ t|ƒ| _t|ƒ| _t|ƒ| _|  ¡  |  	¡  d S r¬   )
rH   rI   r«   r¢   ÚAlbertMLMHeadr   ÚAlbertSOPHeadÚsop_classifierr¯   Útie_weightsrP   rR   rC   rD   rI   B  s    


zAlbertForPreTraining.__init__c                 C   s   |   | jj| jjj¡ d S r¬   ©Z_tie_or_clone_weightsr   Údecoderr¢   r   rN   r±   rC   rC   rD   rÇ   L  s    z AlbertForPreTraining.tie_weightsc                 C   s   | j jS r¬   ©r   rÉ   r±   rC   rC   rD   Úget_output_embeddingsO  s    z*AlbertForPreTraining.get_output_embeddingsNc	                 C   s®   | j ||||||d}	|	dd… \}
}|  |
¡}|  |¡}||f|	dd…  }	|dk	rª|dk	rªtƒ }|| d| jj¡| d¡ƒ}|| dd¡| d¡ƒ}|| }|f|	 }	|	S )a  
        masked_lm_labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
            Labels for computing the masked language modeling loss.
            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
            in ``[0, ..., config.vocab_size]``
        sentence_order_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`):
            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see :obj:`input_ids` docstring)
            Indices should be in ``[0, 1]``.
            ``0`` indicates original order (sequence A, then sequence B),
            ``1`` indicates switched order (sequence B, then sequence A).

    Returns:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
        loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
            Total loss as the sum of the masked language modeling loss and the next sequence prediction (classification) loss.
        prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        sop_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`):
            Prediction scores of the next sequence prediction (classification) head (scores of True/False
            continuation before SoftMax).
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.


    Examples::

        from transformers import AlbertTokenizer, AlbertForPreTraining
        import torch

        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
        model = AlbertForPreTraining.from_pretrained('albert-base-v2')

        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids)

        prediction_scores, sop_scores = outputs[:2]

        ©rƒ   r¹   r¸   r„   rº   Nr   rj   )r¢   r   rÆ   r   rn   r9   rL   )rQ   r‚   rƒ   r¹   r¸   r„   rº   Úmasked_lm_labelsZsentence_order_labelr™   rÀ   rÁ   Úprediction_scoresZ
sop_scoresÚloss_fctÚmasked_lm_lossZsentence_order_lossÚ
total_lossrC   rC   rD   r‡   R  s&    =ú	


zAlbertForPreTraining.forward)NNNNNNNN©
rT   rU   rV   rI   rÇ   rË   r   rÂ   r‡   rX   rC   rC   rR   rD   rÃ   <  s   
        ÷rÃ   c                       s$   e Zd Z‡ fdd„Zdd„ Z‡  ZS )rÄ   c                    sj   t ƒ  ¡  t |j¡| _t t |j¡¡| _	t 
|j|j¡| _t 
|j|j¡| _t|j | _| j	| j_	d S r¬   )rH   rI   rJ   r   rM   Ú	Parameterr6   r½   rL   r   ra   r]   rb   rÉ   r   rŠ   r‹   rP   rR   rC   rD   rI   ª  s    
zAlbertMLMHead.__init__c                 C   s0   |   |¡}|  |¡}|  |¡}|  |¡}|}|S r¬   )rb   r‹   r   rÉ   )rQ   rŒ   rÎ   rC   rC   rD   r‡   ¶  s    



zAlbertMLMHead.forwardr   rC   rC   rR   rD   rÄ   ©  s   rÄ   c                       s$   e Zd Z‡ fdd„Zdd„ Z‡  ZS )rÅ   c                    s.   t ƒ  ¡  t |j¡| _t |j|j¡| _	d S r¬   )
rH   rI   rJ   r_   Úclassifier_dropout_probr`   ra   r]   Ú
num_labelsr   rP   rR   rC   rD   rI   Â  s    
zAlbertSOPHead.__init__c                 C   s   |   |¡}|  |¡}|S r¬   )r`   r   )rQ   rÁ   Zdropout_pooled_outputÚlogitsrC   rC   rD   r‡   È  s    

zAlbertSOPHead.forwardr   rC   rC   rR   rD   rÅ   Á  s   rÅ   z4Albert Model with a `language modeling` head on top.c                       s>   e Zd Z‡ fdd„Zdd„ Zdd„ Zeeƒd
dd	„ƒZ‡  Z	S )ÚAlbertForMaskedLMc                    s4   t ƒ  |¡ t|ƒ| _t|ƒ| _|  ¡  |  ¡  d S r¬   )rH   rI   r«   r¢   rÄ   r   r¯   rÇ   rP   rR   rC   rD   rI   Ò  s
    

zAlbertForMaskedLM.__init__c                 C   s   |   | jj| jjj¡ d S r¬   rÈ   r±   rC   rC   rD   rÇ   Û  s    zAlbertForMaskedLM.tie_weightsc                 C   s   | j jS r¬   rÊ   r±   rC   rC   rD   rË   Þ  s    z'AlbertForMaskedLM.get_output_embeddingsNc                 C   sr   | j ||||||d}|d }	|  |	¡}
|
f|dd…  }|dk	rntƒ }||
 d| jj¡| d¡ƒ}|f| }|S )a&	  
        masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Labels for computing the masked language modeling loss.
            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with
            labels in ``[0, ..., config.vocab_size]``

    Returns:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs:
        loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
            Masked language modeling loss.
        prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Example::

        from transformers import AlbertTokenizer, AlbertForMaskedLM
        import torch

        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
        model = AlbertForMaskedLM.from_pretrained('albert-base-v2')
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids, masked_lm_labels=input_ids)
        loss, prediction_scores = outputs[:2]

        ©r‚   rƒ   r¹   r¸   r„   rº   r   r   Nrj   )r¢   r   r   rn   r9   rL   )rQ   r‚   rƒ   r¹   r¸   r„   rº   rÍ   r™   Zsequence_outputsrÎ   rÏ   rÐ   rC   rC   rD   r‡   á  s     0ú

zAlbertForMaskedLM.forward)NNNNNNNrÒ   rC   rC   rR   rD   r×   Î  s   	       ør×   z•Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of
    the pooled output) e.g. for GLUE tasks. c                       s.   e Zd Z‡ fdd„Zeeƒddd„ƒZ‡  ZS )ÚAlbertForSequenceClassificationc                    sL   t ƒ  |¡ |j| _t|ƒ| _t |j¡| _t 	|j
| jj¡| _|  ¡  d S r¬   )rH   rI   rÕ   r«   r¢   rJ   r_   rÔ   r`   ra   r]   r9   r   r¯   rP   rR   rC   rD   rI   ,  s    
z(AlbertForSequenceClassification.__init__Nc                 C   s¢   | j ||||||d}|d }	|  |	¡}	|  |	¡}
|
f|dd…  }|dk	rž| jdkrttƒ }||
 d¡| d¡ƒ}n tƒ }||
 d| j¡| d¡ƒ}|f| }|S )a3	  
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for computing the sequence classification/regression loss.
            Indices should be in ``[0, ..., config.num_labels - 1]``.
            If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
            If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).

    Returns:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs:
        loss: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
            Classification (or regression if config.num_labels==1) loss.
        logits ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

        Examples::

            from transformers import AlbertTokenizer, AlbertForSequenceClassification
            import torch

            tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
            model = AlbertForSequenceClassification.from_pretrained('albert-base-v2')
            input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
            labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
            outputs = model(input_ids, labels=labels)
            loss, logits = outputs[:2]

        rØ   r   r   Nrj   )r¢   r`   r   rÕ   r   rn   r   )rQ   r‚   rƒ   r¹   r¸   r„   rº   Úlabelsr™   rÁ   rÖ   rÏ   ÚlossrC   rC   rD   r‡   6  s(    2ú	



z'AlbertForSequenceClassification.forward)NNNNNNN©rT   rU   rV   rI   r   rÂ   r‡   rX   rC   rC   rR   rD   rÙ   &  s   
       ørÙ   zœAlbert Model with a token classification head on top (a linear layer on top of
    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. c                       s.   e Zd Z‡ fdd„Zeeƒddd„ƒZ‡  ZS )ÚAlbertForTokenClassificationc                    sL   t ƒ  |¡ |j| _t|ƒ| _t |j¡| _t 	|j
| jj¡| _|  ¡  d S r¬   )rH   rI   rÕ   r«   r¢   rJ   r_   Zhidden_dropout_probr`   ra   r]   r9   r   r¯   rP   rR   rC   rD   rI   ‹  s    
z%AlbertForTokenClassification.__init__Nc                 C   s¼   | j ||||||d}|d }	|  |	¡}	|  |	¡}
|
f|dd…  }|dk	r¸tƒ }|dk	r”| d¡dk}|
 d| j¡| }| d¡| }|||ƒ}n||
 d| j¡| d¡ƒ}|f| }|S )aR  
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Labels for computing the token classification loss.
            Indices should be in ``[0, ..., config.num_labels - 1]``.

    Returns:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs:
        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) :
            Classification loss.
        scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`)
            Classification scores (before SoftMax).
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        from transformers import AlbertTokenizer, AlbertForTokenClassification
        import torch

        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
        model = AlbertForTokenClassification.from_pretrained('albert-base-v2')

        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids, labels=labels)

        loss, scores = outputs[:2]

        rÌ   r   r   Nrj   r   )r¢   r`   r   r   rn   rÕ   )rQ   r‚   rƒ   r¹   r¸   r„   rº   rÚ   r™   rÀ   rÖ   rÏ   Zactive_lossZactive_logitsZactive_labelsrÛ   rC   rC   rD   r‡   •  s,    2ú	


z$AlbertForTokenClassification.forward)NNNNNNNrÜ   rC   rC   rR   rD   rÝ   …  s   
       ørÝ   zÖAlbert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
    the hidden-states output to compute `span start logits` and `span end logits`). c                	       s.   e Zd Z‡ fdd„Zeeƒddd„ƒZ‡  ZS )ÚAlbertForQuestionAnsweringc                    s<   t ƒ  |¡ |j| _t|ƒ| _t |j|j¡| _|  	¡  d S r¬   )
rH   rI   rÕ   r«   r¢   rJ   ra   r]   Ú
qa_outputsr¯   rP   rR   rC   rD   rI   ì  s
    
z#AlbertForQuestionAnswering.__init__Nc	                 C   s   | j ||||||d}	|	d }
|  |
¡}|jddd\}}| d¡}| d¡}||f|	dd…  }	|dk	rü|dk	rüt| ¡ ƒdkrŒ| d¡}t| ¡ ƒdkr¦| d¡}| d¡}| d|¡ | d|¡ t|d}|||ƒ}|||ƒ}|| d }|f|	 }	|	S )	a¹  
        start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`).
            Position outside of the sequence are not taken into account for computing the loss.
        end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`).
            Position outside of the sequence are not taken into account for computing the loss.

    Returns:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs:
        loss: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
            Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
        start_scores ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
            Span-start scores (before SoftMax).
        end_scores: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
            Span-end scores (before SoftMax).
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        # The checkpoint albert-base-v2 is not fine-tuned for question answering. Please see the
        # examples/question-answering/run_squad.py example to see how to fine-tune a model to a question answering task.

        from transformers import AlbertTokenizer, AlbertForQuestionAnswering
        import torch

        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
        model = AlbertForQuestionAnswering.from_pretrained('albert-base-v2')
        question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
        input_dict = tokenizer.encode_plus(question, text, return_tensors='pt')
        start_scores, end_scores = model(**input_dict)

        rØ   r   r   rj   rk   r   N)Zignore_index)r¢   rß   r,   Zsqueezer+   r¼   Zclamp_r   )rQ   r‚   rƒ   r¹   r¸   r„   rº   Zstart_positionsZend_positionsr™   rÀ   rÖ   Zstart_logitsZ
end_logitsZignored_indexrÏ   Z
start_lossZend_lossrÑ   rC   rC   rD   r‡   õ  s8    :ú	









z"AlbertForQuestionAnswering.forward)NNNNNNNNrÜ   rC   rC   rR   rD   rÞ   æ  s   	        ÷rÞ   )+rW   Úloggingr}   r"   r6   Ztorch.nnrJ   r   r   Zconfiguration_albertr   Z
file_utilsr   r   Zmodeling_bertr   r	   r
   r   Zmodeling_utilsr   Ú	getLoggerrT   r    r¨   rE   rF   rY   ÚModulerˆ   rŽ   rš   r¡   ZALBERT_START_DOCSTRINGrÂ   r«   rÃ   rÄ   rÅ   r×   rÙ   rÝ   rÞ   rC   rC   rC   rD   Ú<module>   s|   
ø|Q.&þ ýh ÿUýZý\ý