# coding=utf-8
""" TF 2.0 ALBERT model. """


import logging

import tensorflow as tf

from .configuration_albert import AlbertConfig
from .file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_tf_bert import ACT2FN, TFBertSelfAttention
from .modeling_tf_utils import TFPreTrainedModel, get_initializer, keras_serializable, shape_list
from .tokenization_utils import BatchEncoding


logger = logging.getLogger(__name__)

TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
    "albert-base-v1": "https://cdn.huggingface.co/albert-base-v1-with-prefix-tf_model.h5",
    "albert-large-v1": "https://cdn.huggingface.co/albert-large-v1-with-prefix-tf_model.h5",
    "albert-xlarge-v1": "https://cdn.huggingface.co/albert-xlarge-v1-with-prefix-tf_model.h5",
    "albert-xxlarge-v1": "https://cdn.huggingface.co/albert-xxlarge-v1-with-prefix-tf_model.h5",
    "albert-base-v2": "https://cdn.huggingface.co/albert-base-v2-with-prefix-tf_model.h5",
    "albert-large-v2": "https://cdn.huggingface.co/albert-large-v2-with-prefix-tf_model.h5",
    "albert-xlarge-v2": "https://cdn.huggingface.co/albert-xlarge-v2-with-prefix-tf_model.h5",
    "albert-xxlarge-v2": "https://cdn.huggingface.co/albert-xxlarge-v2-with-prefix-tf_model.h5",
}


class TFAlbertEmbeddings(tf.keras.layers.Layer):
    """Construct the embeddings from word, position and token_type embeddings."""

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        self.config = config
        self.position_embeddings = tf.keras.layers.Embedding(
            config.max_position_embeddings,
            config.embedding_size,
            embeddings_initializer=get_initializer(self.config.initializer_range),
            name="position_embeddings",
        )
        self.token_type_embeddings = tf.keras.layers.Embedding(
            config.type_vocab_size,
            config.embedding_size,
            embeddings_initializer=get_initializer(self.config.initializer_range),
            name="token_type_embeddings",
        )

        # self.word_embeddings is created in build() so that the same matrix can be
        # shared with the output projection (see the "linear" mode below).
        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)

    def build(self, input_shape):
        """Build shared word embedding layer."""
        with tf.name_scope("word_embeddings"):
            self.word_embeddings = self.add_weight(
                "weight",
                shape=[self.config.vocab_size, self.config.embedding_size],
                initializer=get_initializer(self.config.initializer_range),
            )
        super().build(input_shape)

    def call(self, inputs, mode="embedding", training=False):
        """Get token embeddings of inputs.
        Args:
            inputs: list of four tensors, (input_ids, position_ids, token_type_ids, inputs_embeds);
                the id tensors are int64 with shape [batch_size, length], and unused entries may be None
            mode: string, a valid value is one of "embedding" and "linear".
        Returns:
            outputs: (1) If mode == "embedding", output embedding tensor, float32 with
                shape [batch_size, length, embedding_size]; (2) mode == "linear", output
                linear tensor, float32 with shape [batch_size, length, vocab_size].
        Raises:
            ValueError: if mode is not valid.

        Shared weights logic adapted from
            https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
        """
        if mode == "embedding":
            return self._embedding(inputs, training=training)
        elif mode == "linear":
            return self._linear(inputs)
        else:
            raise ValueError("mode {} is not valid.".format(mode))

    def _embedding(self, inputs, training=False):
        """Applies embedding based on inputs tensor."""
        input_ids, position_ids, token_type_ids, inputs_embeds = inputs

        if input_ids is not None:
            input_shape = shape_list(input_ids)
        else:
            input_shape = shape_list(inputs_embeds)[:-1]

        seq_length = input_shape[1]
        if position_ids is None:
            position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :]
        if token_type_ids is None:
            token_type_ids = tf.fill(input_shape, 0)
        if inputs_embeds is None:
            inputs_embeds = tf.gather(self.word_embeddings, input_ids)

        position_embeddings = self.position_embeddings(position_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = inputs_embeds + position_embeddings + token_type_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings, training=training)
        return embeddings

    def _linear(self, inputs):
        """Computes logits by running inputs through a linear layer.
            Args:
                inputs: A float32 tensor with shape [batch_size, length, embedding_size]
            Returns:
                float32 tensor with shape [batch_size, length, vocab_size].
        """
        batch_size = shape_list(inputs)[0]
        length = shape_list(inputs)[1]
        x = tf.reshape(inputs, [-1, self.config.embedding_size])
        logits = tf.matmul(x, self.word_embeddings, transpose_b=True)
        return tf.reshape(logits, [batch_size, length, self.config.vocab_size])

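# Note on the two call modes above: "embedding" gathers rows of `word_embeddings`,
# while "linear" multiplies by the transpose of the same matrix, so the masked-LM
# output projection is weight-tied to the input embeddings. A minimal sketch of the
# round trip (illustrative only; the config values and variable names are assumptions):
#
#     config = AlbertConfig(vocab_size=30000, embedding_size=128)
#     emb = TFAlbertEmbeddings(config, name="embeddings")
#     ids = tf.constant([[7, 42, 311]])
#     hidden = emb([ids, None, None, None], mode="embedding")  # (1, 3, 128)
#     logits = emb(hidden, mode="linear")                      # (1, 3, 30000)
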

class TFAlbertSelfAttention(tf.keras.layers.Layer):
    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        if config.hidden_size % config.num_attention_heads != 0:
            raise ValueError(
                "The hidden size (%d) is not a multiple of the number of attention heads (%d)"
                % (config.hidden_size, config.num_attention_heads)
            )
        self.num_attention_heads = config.num_attention_heads
        self.output_attentions = config.output_attentions
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = tf.keras.layers.Dense(
            self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
        )
        self.key = tf.keras.layers.Dense(
            self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
        )
        self.value = tf.keras.layers.Dense(
            self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
        )
        self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob)

    def transpose_for_scores(self, x, batch_size):
        # Reshape from [batch_size, seq_length, all_head_size] to
        # [batch_size, num_attention_heads, seq_length, attention_head_size].
        x = tf.reshape(x, (batch_size, -1, self.num_attention_heads, self.attention_head_size))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, inputs, training=False):
        hidden_states, attention_mask, head_mask = inputs

        batch_size = shape_list(hidden_states)[0]
        mixed_query_layer = self.query(hidden_states)
        mixed_key_layer = self.key(hidden_states)
        mixed_value_layer = self.value(hidden_states)

        query_layer = self.transpose_for_scores(mixed_query_layer, batch_size)
        key_layer = self.transpose_for_scores(mixed_key_layer, batch_size)
        value_layer = self.transpose_for_scores(mixed_value_layer, batch_size)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
        dk = tf.cast(shape_list(key_layer)[-1], tf.float32)  # scale attention_scores
        attention_scores = attention_scores / tf.math.sqrt(dk)

        if attention_mask is not None:
            # Apply the attention mask (precomputed for all layers in TFAlbertMainLayer.call)
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = tf.nn.softmax(attention_scores, axis=-1)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs, training=training)

        # Mask heads if we want to.
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = tf.matmul(attention_probs, value_layer)

        context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3])
        context_layer = tf.reshape(context_layer, (batch_size, -1, self.all_head_size))

        outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,)
        return outputs


class TFAlbertSelfOutput(tf.keras.layers.Layer):
    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        self.dense = tf.keras.layers.Dense(
            config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
        )
        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)

    def call(self, inputs, training=False):
        hidden_states, input_tensor = inputs

        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states, training=training)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states

|jdd| _t | _d S r}   )r   r   rX   r   r   r   r_   r
   r   r~   r   r   r   setZpruned_headsr"   r%   r'   r(   r      s      zTFAlbertAttention.__init__c                 C   s   t d S NNotImplementedError)r#   Zheadsr'   r'   r(   prune_heads   s    zTFAlbertAttention.prune_headsFc                 C   s\  |\}}}t |d }| |}| |}| |}	| ||}
| ||}| |	|}tj|
|dd}tt |d tj}|tj	
| }|d k	r|| }tjj|dd}| j||d}|d k	r|| }t||}tj|dddd	gd
}t||d| jf}| jr||fn|f}|d }| |}| j||d}| || }|f|dd   }|S rf   )r   rS   rV   rW   re   r   rH   rh   ri   rj   rk   rl   rm   r!   rd   rG   r^   rZ   r~   r   )r#   r9   r3   r   ro   rp   rI   rq   rr   rs   rt   ru   rv   rw   rx   ry   rz   Zself_outputsrn   Zattention_outputr{   r'   r'   r(   r;      s<    



 

zTFAlbertAttention.call)F)rM   rN   rO   r   r   r;   rQ   r'   r'   r%   r(   r      s   
r   c                       s&   e Zd Z fddZdddZ  ZS )TFAlbertLayerc                    s   t  jf | t|dd| _tjjj|jt	|j
dd| _t|jtrTt|j | _n|j| _tjjj|jt	|j
dd| _tjjj|jdd| _tjj|j| _d S )N	attentionr   ffnrT   
ffn_outputfull_layer_layer_normr   )r   r   r   r   r   r   r   r_   Zintermediate_sizer
   r   r   
isinstance
hidden_actstrr   
activationrX   r   r   r   r   r   r    r!   r"   r%   r'   r(   r   2  s(         zTFAlbertLayer.__init__Fc           	      C   sv   |\}}}| j |||g|d}| |d }| |}| |}| j||d}| ||d  }|f|dd   }|S )Nr2   r   r   )r   r   r   r   r!   r   )	r#   r9   r3   rn   ro   rp   Zattention_outputsr   r{   r'   r'   r(   r;   G  s    


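# Shape walk-through for the self-attention used above, under assumed sizes
# (batch=2, seq=9, hidden=768, 12 heads of 64 dims -- illustrative values only):
#
#     q = tf.zeros((2, 9, 768))
#     q = tf.reshape(q, (2, -1, 12, 64))      # (2, 9, 12, 64)
#     q = tf.transpose(q, perm=[0, 2, 1, 3])  # (2, 12, 9, 64)
#     # scores = q @ k^T -> (2, 12, 9, 9), softmaxed over the last axis, then
#     # scores @ v -> (2, 12, 9, 64), transposed and reshaped back to (2, 9, 768).
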
zTFAlbertLayer.call)Fr   r'   r'   r%   r(   r   1  s   r   c                       s&   e Zd Z fddZdddZ  ZS )TFAlbertLayerGroupc                    s<   t  jf |  j| _ j| _ fddt jD | _d S )Nc                    s   g | ]}t  d |dqS )zalbert_layers_._{}r   )r   r8   .0ir   r'   r(   
<listcomp>]  s    z/TFAlbertLayerGroup.__init__.<locals>.<listcomp>)r   r   rZ   output_hidden_statesr=   Zinner_group_numalbert_layersr"   r%   r   r(   r   X  s    
zTFAlbertLayerGroup.__init__Fc                 C   s   |\}}}d}d}t | jD ]J\}}	|	|||| g|d}
|
d }| jrV||
d f }| jr||f }q|f}| jr~||f }| jr||f }|S )Nr'   r2   r   r   )	enumerater   rZ   r   )r#   r9   r3   rn   ro   rp   Zlayer_hidden_statesZlayer_attentionsZlayer_indexZalbert_layerZlayer_outputr{   r'   r'   r(   r;   a  s     


zTFAlbertLayerGroup.call)Fr   r'   r'   r%   r(   r   W  s   	r   c                       s&   e Zd Z fddZdddZ  ZS )TFAlbertTransformerc                    s`   t  jf |  | _ j| _ j| _tjjj j	t
 jdd| _ fddt jD | _d S )Nembedding_hidden_mapping_inrT   c                    s   g | ]}t  d |dqS )zalbert_layer_groups_._{}r   )r   r8   r   r   r'   r(   r     s   z0TFAlbertTransformer.__init__.<locals>.<listcomp>)r   r   r   rZ   r   r   r   r   r_   rX   r
   r   r   r=   num_hidden_groupsalbert_layer_groupsr"   r%   r   r(   r   {  s    
zTFAlbertTransformer.__init__Fc                 C   s   |\}}}|  |}d}| jr$|f}t| jjD ]}t| jj| jj }	t|| jj| jj  }
| j|
 ||||
|	 |
d |	  g|d}|d }| jr||d  }| jr0||f }q0|f}| jr||f }| jr||f }|S )Nr'   r   r2   r   r<   )	r   r   r=   r   num_hidden_layersr\   r   r   rZ   )r#   r9   r3   rn   ro   rp   Zall_attentionsZall_hidden_statesr   Zlayers_per_groupZ	group_idxZlayer_group_outputr{   r'   r'   r(   r;     s4    



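# ALBERT's cross-layer parameter sharing happens in the loop above: num_hidden_layers
# forward passes are routed through only num_hidden_groups distinct layer groups. A
# sketch of the index arithmetic with the (assumed) base configuration of 12 layers
# and a single group, where the same weights are applied 12 times:
#
#     num_hidden_layers, num_hidden_groups = 12, 1
#     layers_per_group = int(num_hidden_layers / num_hidden_groups)     # 12
#     for i in range(num_hidden_layers):
#         group_idx = int(i / (num_hidden_layers / num_hidden_groups))  # always 0
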
zTFAlbertTransformer.call)Fr   r'   r'   r%   r(   r   z  s   r   c                   @   s   e Zd ZdZeZeZdZdS )TFAlbertPreTrainedModelz An abstract class to handle weights initialization and
        a simple interface for downloading and loading pretrained models.
    albertN)	rM   rN   rO   rP   r   config_class&TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAPZpretrained_model_archive_mapZbase_model_prefixr'   r'   r'   r(   r     s   r   c                       s0   e Zd Z fddZ fddZdd Z  ZS )TFAlbertMLMHeadc                    sv   t  jf | |j| _tjjj|jt|j	dd| _
t|jtrNt|j | _n|j| _tjjj|jdd| _|| _d S r}   )r   r   r-   r   r   r   r_   r   r
   r   r~   r   r   r   r   r   r   r   r   decoder)r#   r   Zinput_embeddingsr$   r%   r'   r(   r     s      zTFAlbertMLMHead.__init__c                    s@   | j | jfdddd| _| j | jfdddd| _t | d S )NzerosTbias)r*   r+   Z	trainabler   zdecoder/bias)r,   r-   r   decoder_biasr   r.   r/   r%   r'   r(   r.     s       zTFAlbertMLMHead.buildc                 C   s6   |  |}| |}| |}| j|dd| j }|S )Nr4   )r:   )r~   r   r   r   r   )r#   rn   r'   r'   r(   r;     s
    


zTFAlbertMLMHead.call)rM   rN   rO   r   r.   r;   rQ   r'   r'   r%   r(   r     s   r   c                       sB   e Zd ZeZ fddZdd Zdd Zdd ZdddZ	  Z
S )TFAlbertMainLayerc                    sV   t  jf | |j| _t|dd| _t|dd| _tjj	j
|jt|jddd| _d S )NrE   r   encodertanhpooler)rU   r   r   )r   r   r   r   rE   r   r   r   r   r   r_   rX   r
   r   r   r"   r%   r'   r(   r     s    zTFAlbertMainLayer.__init__c                 C   s   | j S r   )rE   r#   r'   r'   r(   get_input_embeddings  s    z&TFAlbertMainLayer.get_input_embeddingsc                 C   s   t d S r   r   )r#   Znew_num_tokensr'   r'   r(   _resize_token_embeddings  s    z*TFAlbertMainLayer._resize_token_embeddingsc                 C   s   t dS )z Prunes heads of the model.
            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
            See base class PreTrainedModel
        """
        raise NotImplementedError

    def call(
        self,
        inputs,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        training=False,
    ):
        if isinstance(inputs, (tuple, list)):
            input_ids = inputs[0]
            attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
            token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids
            position_ids = inputs[3] if len(inputs) > 3 else position_ids
            head_mask = inputs[4] if len(inputs) > 4 else head_mask
            inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
            assert len(inputs) <= 6, "Too many inputs."
        elif isinstance(inputs, (dict, BatchEncoding)):
            input_ids = inputs.get("input_ids")
            attention_mask = inputs.get("attention_mask", attention_mask)
            token_type_ids = inputs.get("token_type_ids", token_type_ids)
            position_ids = inputs.get("position_ids", position_ids)
            head_mask = inputs.get("head_mask", head_mask)
            inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
            assert len(inputs) <= 6, "Too many inputs."
        else:
            input_ids = inputs

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = shape_list(input_ids)
        elif inputs_embeds is not None:
            input_shape = shape_list(inputs_embeds)[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if attention_mask is None:
            attention_mask = tf.fill(input_shape, 1)
        if token_type_ids is None:
            token_type_ids = tf.fill(input_shape, 0)

        # We create a 4D attention mask from a 2D tensor mask so it can be broadcast
        # to [batch_size, num_heads, seq_length, seq_length]. Since attention_mask is
        # 1.0 for positions we want to attend and 0.0 for masked positions, the
        # additive mask is 0.0 for kept positions and -10000.0 for masked positions,
        # which is effectively minus infinity for the softmax.
        extended_attention_mask = attention_mask[:, tf.newaxis, tf.newaxis, :]
        extended_attention_mask = tf.cast(extended_attention_mask, tf.float32)
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0

        if head_mask is not None:
            raise NotImplementedError
        else:
            head_mask = [None] * self.num_hidden_layers

        embedding_output = self.embeddings(
            [input_ids, position_ids, token_type_ids, inputs_embeds], training=training
        )
        encoder_outputs = self.encoder([embedding_output, extended_attention_mask, head_mask], training=training)

        sequence_output = encoder_outputs[0]
        pooled_output = self.pooler(sequence_output[:, 0])

        # sequence_output, pooled_output, (hidden_states), (attentions)
        outputs = (sequence_output, pooled_output,) + encoder_outputs[1:]
        return outputs

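# The padding mask enters the attention logits additively: kept positions become
# 0.0 and padded positions -10000.0, which vanishes after the softmax. A minimal
# sketch (illustrative tensors):
#
#     attention_mask = tf.constant([[1, 1, 1, 0]])
#     extended = attention_mask[:, tf.newaxis, tf.newaxis, :]
#     extended = (1.0 - tf.cast(extended, tf.float32)) * -10000.0
#     # -> [[[[-0., -0., -0., -10000.]]]], broadcast over heads and query positions
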
      r   a  
    This model is a `tf.keras.Model <https://www.tensorflow.org/api_docs/python/tf/keras/Model>`__ sub-class.
    Use it as a regular TF 2.0 Keras Model and
    refer to the TF 2.0 documentation for all matters related to general usage and behavior.

    .. _`ALBERT: A Lite BERT for Self-supervised Learning of Language Representations`:
        https://arxiv.org/abs/1909.11942

    .. _`tf.keras.Model`:
        https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/Model

    .. note::

        TF 2.0 models accept two formats as inputs:

            - having all inputs as keyword arguments (like PyTorch models), or
            - having all inputs as a list, tuple or dict in the first positional arguments.

        This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having
        all the tensors in the first argument of the model call function: :obj:`model(inputs)`.

        If you choose this second option, there are three possibilities you can use to gather all the input Tensors
        in the first positional argument :

        - a single Tensor with input_ids only and nothing else: :obj:`model(input_ids)`
        - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
          :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])`
        - a dictionary with one or several input Tensors associated to the input names given in the docstring:
          :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`

    Args:
        config (:class:`~transformers.AlbertConfig`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the configuration.
            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
"""

ALBERT_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using :class:`transformers.AlbertTokenizer`.
            See :func:`transformers.PreTrainedTokenizer.encode` and
            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.

            `What are input IDs? <../glossary.html#input-ids>`__
        attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.

            `What are attention masks? <../glossary.html#attention-mask>`__
        token_type_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Segment token indices to indicate first and second portions of the inputs.
            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
            corresponds to a `sentence B` token

            `What are token type IDs? <../glossary.html#token-type-ids>`_
        position_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Selected in the range ``[0, config.max_position_embeddings - 1]``.

            `What are position IDs? <../glossary.html#position-ids>`_
        head_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
            Mask to nullify selected heads of the self-attention modules.
            Mask values selected in ``[0, 1]``:
            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
        inputs_embeds (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
            than the model's internal embedding lookup matrix.
        training (:obj:`boolean`, `optional`, defaults to :obj:`False`):
            Whether to activate dropout modules (if set to :obj:`True`) during training or to de-activate them
            (if set to :obj:`False`) for evaluation.
"""


@add_start_docstrings(
    "The bare Albert Model transformer outputting raw hidden-states without any specific head on top.",
    ALBERT_START_DOCSTRING,
)
class TFAlbertModel(TFAlbertPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.albert = TFAlbertMainLayer(config, name="albert")

    @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
    def call(self, inputs, **kwargs):
        r"""
        Returns:
            :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs:
            last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
                Sequence of hidden-states at the output of the last layer of the model.
            pooler_output (:obj:`tf.Tensor` of shape :obj:`(batch_size, hidden_size)`):
                Last layer hidden-state of the first token of the sequence (classification token)
                further processed by a Linear layer and a Tanh activation function. The Linear
                layer weights are trained from the next sentence prediction (classification)
                objective during Albert pretraining. This output is usually *not* a good summary
                of the semantic content of the input, you're often better with averaging or pooling
                the sequence of hidden-states for the whole input sequence.
            hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
                tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
                of shape :obj:`(batch_size, sequence_length, hidden_size)`.

                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
            attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
                tuple of :obj:`tf.Tensor` (one for each layer) of shape
                :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:

                Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

        Examples::

            import tensorflow as tf
            from transformers import AlbertTokenizer, TFAlbertModel

            tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
            model = TFAlbertModel.from_pretrained('albert-base-v2')
            input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
            outputs = model(input_ids)
            last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple

        """
        outputs = self.albert(inputs, **kwargs)
        return outputs


@add_start_docstrings(
    """Albert Model with two heads on top for pre-training:
    a `masked language modeling` head and a `sentence order prediction` (classification) head. """,
    ALBERT_START_DOCSTRING,
)
class TFAlbertForPreTraining(TFAlbertPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels

        self.albert = TFAlbertMainLayer(config, name="albert")
        self.predictions = TFAlbertMLMHead(config, self.albert.embeddings, name="predictions")
        self.sop_classifier = TFAlbertSOPHead(config, name="sop_classifier")

    def get_output_embeddings(self):
        return self.albert.embeddings

    @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
    def call(self, inputs, **kwargs):
        r"""
    Return:
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs:
        prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        sop_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, 2)`):
            Prediction scores of the sentence order prediction (classification) head (scores of True/False continuation before SoftMax).
        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
            tuple of :obj:`tf.Tensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
    Examples::
        import tensorflow as tf
        from transformers import AlbertTokenizer, TFAlbertForPreTraining
        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
        model = TFAlbertForPreTraining.from_pretrained('albert-base-v2')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
        outputs = model(input_ids)
        prediction_scores, sop_scores = outputs[:2]
        """
        outputs = self.albert(inputs, **kwargs)
        sequence_output, pooled_output = outputs[:2]
        prediction_scores = self.predictions(sequence_output)
        sop_scores = self.sop_classifier(pooled_output, training=kwargs.get("training", False))

        # prediction_scores, sop_scores, (hidden_states), (attentions)
        outputs = (prediction_scores, sop_scores) + outputs[2:]
        return outputs


class TFAlbertSOPHead(tf.keras.layers.Layer):
    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        self.dropout = tf.keras.layers.Dropout(config.classifier_dropout_prob)
        self.classifier = tf.keras.layers.Dense(
            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
        )

    def call(self, pooled_output, training: bool):
        dropout_pooled_output = self.dropout(pooled_output, training=training)
        logits = self.classifier(dropout_pooled_output)
        return logits


@add_start_docstrings("""Albert Model with a `language modeling` head on top. """, ALBERT_START_DOCSTRING)
class TFAlbertForMaskedLM(TFAlbertPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)

        self.albert = TFAlbertMainLayer(config, name="albert")
        self.predictions = TFAlbertMLMHead(config, self.albert.embeddings, name="predictions")

    def get_output_embeddings(self):
        return self.albert.embeddings

    @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
    def call(self, inputs, **kwargs):
        r"""
    Returns:
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs:
        prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
            tuple of :obj:`tf.Tensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import AlbertTokenizer, TFAlbertForMaskedLM

        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
        model = TFAlbertForMaskedLM.from_pretrained('albert-base-v2')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
        outputs = model(input_ids)
        prediction_scores = outputs[0]

        """
        outputs = self.albert(inputs, **kwargs)
        sequence_output = outputs[0]
        prediction_scores = self.predictions(sequence_output, training=kwargs.get("training", False))

        # Add hidden states and attentions if they are here.
        outputs = (prediction_scores,) + outputs[2:]
        return outputs  # prediction_scores, (hidden_states), (attentions)


@add_start_docstrings(
    """Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of
    the pooled output) e.g. for GLUE tasks. """,
    ALBERT_START_DOCSTRING,
)
class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels

        self.albert = TFAlbertMainLayer(config, name="albert")
        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
        self.classifier = tf.keras.layers.Dense(
            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
        )

    @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
    def call(self, inputs, **kwargs):
        r"""
    Returns:
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs:
        logits (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`)
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
            tuple of :obj:`tf.Tensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import AlbertTokenizer, TFAlbertForSequenceClassification

        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
        model = TFAlbertForSequenceClassification.from_pretrained('albert-base-v2')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
        outputs = model(input_ids)
        logits = outputs[0]

        r   r3   Fr2   r`   N)r   r!   r   r   )r#   r9   r$   r{   r   rL   r'   r'   r(   r;   W  s    
z&TFAlbertForSequenceClassification.callr   r'   r'   r%   r(   r   G  s   
r   zAlbert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). c                       s,   e Zd Z fddZeedd Z  ZS )TFAlbertForQuestionAnsweringc                    sL   t  j|f|| |j| _t|dd| _tjjj|jt	|j
dd| _d S )Nr   r   
qa_outputsrT   )r   r   r   r   r   r   r   r   r_   r
   r   r   r   r%   r'   r(   r     s      z%TFAlbertForQuestionAnswering.__init__c                 K   sh   | j |f|}|d }| |}tj|ddd\}}tj|dd}tj|dd}||f|dd  }|S )aA  
    Return:
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs:
        start_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`):
            Span-start scores (before SoftMax).
        end_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`):
            Span-end scores (before SoftMax).
        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
            tuple of :obj:`tf.Tensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        # The checkpoint albert-base-v2 is not fine-tuned for question answering. Please see the
        # examples/question-answering/run_squad.py example to see how to fine-tune a model to a question answering task.

        import tensorflow as tf
        from transformers import AlbertTokenizer, TFAlbertForQuestionAnswering

        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
        model = TFAlbertForQuestionAnswering.from_pretrained('albert-base-v2')
        input_ids = tokenizer.encode("Who was Jim Henson?", "Jim Henson was a nice puppet")
        start_scores, end_scores = model(tf.constant(input_ids)[None, :]) # Batch size 1

        all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
        answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])

        """
        outputs = self.albert(inputs, **kwargs)
        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = tf.split(logits, 2, axis=-1)
        start_logits = tf.squeeze(start_logits, axis=-1)
        end_logits = tf.squeeze(end_logits, axis=-1)

        outputs = (start_logits, end_logits,) + outputs[2:]
        return outputs  # start_logits, end_logits, (hidden_states), (attentions)


@add_start_docstrings(
    """Albert Model with a multiple choice classification head on top (a linear layer on top of
    the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
    ALBERT_START_DOCSTRING,
)
class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)

        self.albert = TFAlbertMainLayer(config, name="albert")
        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
        self.classifier = tf.keras.layers.Dense(
            1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
        )

    @property
    def dummy_inputs(self):
        """ Dummy inputs to build the network.

        Returns:
            tf.Tensor with dummy inputs
        """
        return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}

    @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
    def call(
        self,
        inputs,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        training=False,
    ):
        r"""
    Return:
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs:
        classification_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`:
            `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above).

            Classification scores (before SoftMax).
        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
            tuple of :obj:`tf.Tensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import AlbertTokenizer, TFAlbertForMultipleChoice

        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
        model = TFAlbertForMultipleChoice.from_pretrained('albert-base-v2')

        example1 = ["This is a context", "Is it a context? Yes"]
        example2 = ["This is a context", "Is it a context? No"]
        encoding = tokenizer.batch_encode_plus([example1, example2], return_tensors='tf', truncation_strategy="only_first", pad_to_max_length=True, max_length=128)
        outputs = model(encoding["input_ids"][None, :])
        logits = outputs[0]

        """
        if isinstance(inputs, (tuple, list)):
            input_ids = inputs[0]
            attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
            token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids
            position_ids = inputs[3] if len(inputs) > 3 else position_ids
            head_mask = inputs[4] if len(inputs) > 4 else head_mask
            inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
            assert len(inputs) <= 6, "Too many inputs."
        elif isinstance(inputs, (dict, BatchEncoding)):
            input_ids = inputs.get("input_ids")
            attention_mask = inputs.get("attention_mask", attention_mask)
            token_type_ids = inputs.get("token_type_ids", token_type_ids)
            position_ids = inputs.get("position_ids", position_ids)
            head_mask = inputs.get("head_mask", head_mask)
            inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
            assert len(inputs) <= 6, "Too many inputs."
        else:
            input_ids = inputs

        if input_ids is not None:
            num_choices = shape_list(input_ids)[1]
            seq_length = shape_list(input_ids)[2]
        else:
            num_choices = shape_list(inputs_embeds)[1]
            seq_length = shape_list(inputs_embeds)[2]

        flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None
        flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
        flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
        flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None

        flat_inputs = [
            flat_input_ids,
            flat_attention_mask,
            flat_token_type_ids,
            flat_position_ids,
            head_mask,
            inputs_embeds,
        ]

        outputs = self.albert(flat_inputs, training=training)

        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output, training=training)
        logits = self.classifier(pooled_output)
        reshaped_logits = tf.reshape(logits, (-1, num_choices))

        # Add hidden states and attentions if they are here.
        outputs = (reshaped_logits,) + outputs[2:]
        return outputs  # reshaped_logits, (hidden_states), (attentions)
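

# The multiple-choice head runs the shared encoder once over the flattened choices
# and reshapes the per-choice scores back. A sketch of the reshaping with assumed
# sizes (batch=2, 4 choices, seq=16 -- illustrative only):
#
#     input_ids = tf.zeros((2, 4, 16), dtype=tf.int32)
#     flat = tf.reshape(input_ids, (-1, 16))  # (8, 16), fed through self.albert
#     logits = tf.zeros((8, 1))               # classifier output, one score per choice
#     reshaped = tf.reshape(logits, (-1, 4))  # (2, 4)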