U
    &c                     @   s  d Z ddlZddlZddlZddlmZ ddlm	Z	m
Z
mZ ddlmZmZmZmZ ddlmZ eeZdd	d
ddddddddddddddddddZdd Zdd  Zd!d" Zejjeejjjejjeejjed#ZG d$d% d%ejjj Z!G d&d' d'ejjj Z"G d(d) d)ejjj Z#G d*d+ d+ejjj Z$G d,d- d-ejjj Z%G d.d/ d/ejjj Z&G d0d1 d1ejjj Z'G d2d3 d3ejjj Z(G d4d5 d5ejjj Z)G d6d7 d7ejjj Z*G d8d9 d9ejjj Z+G d:d; d;ejjj Z,G d<d= d=ejjj Z-eG d>d? d?ejjj Z.G d@dA dAeZ/dBZ0dCZ1e
dDe0G dEdF dFe/Z2e
dGe0G dHdI dIe/Z3e
dJe0G dKdL dLe/Z4e
dMe0G dNdO dOe/Z5e
dPe0G dQdR dRe/Z6e
dSe0G dTdU dUe/Z7e
dVe0G dWdX dXe/Z8e
dYe0G dZd[ d[e/Z9dS )\z TF 2.0 BERT model.     N   )
BertConfig)MULTIPLE_CHOICE_DUMMY_INPUTSadd_start_docstrings add_start_docstrings_to_callable)TFPreTrainedModelget_initializerkeras_serializable
shape_list)BatchEncodingz8https://cdn.huggingface.co/bert-base-uncased-tf_model.h5z9https://cdn.huggingface.co/bert-large-uncased-tf_model.h5z6https://cdn.huggingface.co/bert-base-cased-tf_model.h5z7https://cdn.huggingface.co/bert-large-cased-tf_model.h5zEhttps://cdn.huggingface.co/bert-base-multilingual-uncased-tf_model.h5zChttps://cdn.huggingface.co/bert-base-multilingual-cased-tf_model.h5z8https://cdn.huggingface.co/bert-base-chinese-tf_model.h5z=https://cdn.huggingface.co/bert-base-german-cased-tf_model.h5zLhttps://cdn.huggingface.co/bert-large-uncased-whole-word-masking-tf_model.h5zJhttps://cdn.huggingface.co/bert-large-cased-whole-word-masking-tf_model.h5z\https://cdn.huggingface.co/bert-large-uncased-whole-word-masking-finetuned-squad-tf_model.h5zZhttps://cdn.huggingface.co/bert-large-cased-whole-word-masking-finetuned-squad-tf_model.h5zEhttps://cdn.huggingface.co/bert-base-cased-finetuned-mrpc-tf_model.h5zChttps://cdn.huggingface.co/cl-tohoku/bert-base-japanese/tf_model.h5zVhttps://cdn.huggingface.co/cl-tohoku/bert-base-japanese-whole-word-masking/tf_model.h5zHhttps://cdn.huggingface.co/cl-tohoku/bert-base-japanese-char/tf_model.h5z[https://cdn.huggingface.co/cl-tohoku/bert-base-japanese-char-whole-word-masking/tf_model.h5zJhttps://cdn.huggingface.co/TurkuNLP/bert-base-finnish-cased-v1/tf_model.h5zLhttps://cdn.huggingface.co/TurkuNLP/bert-base-finnish-uncased-v1/tf_model.h5zEhttps://cdn.huggingface.co/wietsedv/bert-base-dutch-cased/tf_model.h5)zbert-base-uncasedzbert-large-uncasedzbert-base-casedzbert-large-casedzbert-base-multilingual-uncasedzbert-base-multilingual-casedzbert-base-chinesezbert-base-german-casedz%bert-large-uncased-whole-word-maskingz#bert-large-cased-whole-word-maskingz5bert-large-uncased-whole-word-masking-finetuned-squadz3bert-large-cased-whole-word-masking-finetuned-squadzbert-base-cased-finetuned-mrpczbert-base-japanesez%bert-base-japanese-whole-word-maskingzbert-base-japanese-charz*bert-base-japanese-char-whole-word-maskingzbert-base-finnish-cased-v1zbert-base-finnish-uncased-v1zbert-base-dutch-casedc                 C   s(   ddt j| t jd   }| | S )a   Gaussian Error Linear Unit.
    Original Implementation of the gelu activation function in Google Bert repo when initially created.
        For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
        0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
        Also see https://arxiv.org/abs/1606.08415
          ?      ?g       @)tfmatherfsqrtxZcdf r   A/tmp/pip-unpacked-wheel-ymerj3tt/transformers/modeling_tf_bert.pygelu9   s     r   c                 C   s:   ddt tdtj | dt | d     }| | S )zGaussian Error Linear Unit.
    This is a smoother version of the RELU.
    Original paper: https://arxiv.org/abs/1606.08415
    Args:
        x: float Tensor to perform activation.
    Returns:
        `x` with the GELU activation applied.
    r   r      gHm?   )r   tanhnpr   pipowr   r   r   r   gelu_newD   s    	2r   c                 C   s   | t |  S N)r   Zsigmoid)r   r   r   r   swishQ   s    r   )r   relur   r   c                       sH   e Zd ZdZ fddZ fddZddd	Zdd
dZdd Z  Z	S )TFBertEmbeddingszLConstruct the embeddings from word, position and token_type embeddings.
    c                    s   t  jf | |j| _|j| _|j| _tjjj|j	|jt
| jdd| _tjjj|j|jt
| jdd| _tjjj|jdd| _tjj|j| _d S )Nposition_embeddings)Zembeddings_initializernametoken_type_embeddings	LayerNormepsilonr#   )super__init__
vocab_sizehidden_sizeinitializer_ranger   keraslayersZ	EmbeddingZmax_position_embeddingsr   r"   Ztype_vocab_sizer$   LayerNormalizationlayer_norm_epsr%   Dropouthidden_dropout_probdropoutselfconfigkwargs	__class__r   r   r)   a   s$    	zTFBertEmbeddings.__init__c              	      sF   t d& | jd| j| jgt| jd| _W 5 Q R X t 	| dS )z"Build shared word embedding layer word_embeddingsZweight)shapeinitializerN)
r   Z
name_scope
add_weightr*   r+   r   r,   r:   r(   buildr5   input_shaper8   r   r   r>   y   s    
zTFBertEmbeddings.build	embeddingFc                 C   s:   |dkr| j ||dS |dkr(| |S td|dS )a	  Get token embeddings of inputs.
        Args:
            inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids)
            mode: string, a valid value is one of "embedding" and "linear".
        Returns:
            outputs: (1) If mode == "embedding", output embedding tensor, float32 with
                shape [batch_size, length, embedding_size]; (2) mode == "linear", output
                linear tensor, float32 with shape [batch_size, length, vocab_size].
        Raises:
            ValueError: if mode is not valid.

        Shared weights logic adapted from
            https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
        rA   traininglinearzmode {} is not valid.N)
_embedding_linear
ValueErrorformat)r5   inputsmoderC   r   r   r   call   s
    
zTFBertEmbeddings.callc                 C   s   |\}}}}|dk	rt |}nt |dd }|d }|dkr\tj|tjdtjddf }|dkrpt|d}|dkrt| j|}| |}	| 	|}
||	 |
 }| 
|}| j||d}|S )z)Applies embedding based on inputs tensor.Nr   )Zdtyper   rB   )r
   r   rangeZint32newaxisfillZgatherr:   r"   r$   r%   r3   )r5   rI   rC   	input_idsposition_idstoken_type_idsinputs_embedsr@   
seq_lengthr"   r$   
embeddingsr   r   r   rE      s"    



zTFBertEmbeddings._embeddingc                 C   sP   t |d }t |d }t|d| jg}tj|| jdd}t|||| jgS )a  Computes logits by running inputs through a linear layer.
            Args:
                inputs: A float32 tensor with shape [batch_size, length, hidden_size]
            Returns:
                float32 tensor with shape [batch_size, length, vocab_size].
        r   r   rL   TZtranspose_b)r
   r   reshaper+   matmulr:   r*   )r5   rI   
batch_sizelengthr   logitsr   r   r   rF      s
    zTFBertEmbeddings._linear)rA   F)F)
__name__
__module____qualname____doc__r)   r>   rK   rE   rF   __classcell__r   r   r8   r   r!   ]   s   

r!   c                       s.   e Zd Z fddZdd ZdddZ  ZS )	TFBertSelfAttentionc                    s   t  jf | |j|j dkr2td|j|jf |j| _|j| _|j|j dksVtt|j|j | _| j| j | _	t
jjj| j	t|jdd| _t
jjj| j	t|jdd| _t
jjj| j	t|jdd| _t
jj|j| _d S )Nr   zLThe hidden size (%d) is not a multiple of the number of attention heads (%d)querykernel_initializerr#   keyvalue)r(   r)   r+   num_attention_headsrG   output_attentionsAssertionErrorintattention_head_sizeall_head_sizer   r-   r.   Denser   r,   rb   re   rf   r1   Zattention_probs_dropout_probr3   r4   r8   r   r   r)      s8    
      zTFBertSelfAttention.__init__c                 C   s.   t ||d| j| jf}t j|ddddgdS )NrL   r   r   r   r   perm)r   rW   rg   rk   	transpose)r5   r   rY   r   r   r   transpose_for_scores   s    z(TFBertSelfAttention.transpose_for_scoresFc                 C   s  |\}}}t |d }| |}| |}| |}	| ||}
| ||}| |	|}tj|
|dd}tt |d tj}|tj	
| }|d k	r|| }tjj|dd}| j||d}|d k	r|| }t||}tj|dddd	gd
}t||d| jf}| jr||fn|f}|S )Nr   TrV   rL   ZaxisrB   r   r   r   rn   )r
   rb   re   rf   rq   r   rX   castfloat32r   r   nnZsoftmaxr3   rp   rW   rl   rh   )r5   rI   rC   hidden_statesattention_mask	head_maskrY   Zmixed_query_layerZmixed_key_layerZmixed_value_layerZquery_layerZ	key_layerZvalue_layerZattention_scoresZdkZattention_probsZcontext_layeroutputsr   r   r   rK      s:    



   
zTFBertSelfAttention.call)F)r\   r]   r^   r)   rq   rK   r`   r   r   r8   r   ra      s   ra   c                       s&   e Zd Z fddZdddZ  ZS )TFBertSelfOutputc                    sX   t  jf | tjjj|jt|jdd| _	tjjj
|jdd| _tjj|j| _d S Ndenserc   r%   r&   r(   r)   r   r-   r.   rm   r+   r   r,   r|   r/   r0   r%   r1   r2   r3   r4   r8   r   r   r)     s      zTFBertSelfOutput.__init__Fc                 C   s2   |\}}|  |}| j||d}| || }|S NrB   r|   r3   r%   r5   rI   rC   rv   input_tensorr   r   r   rK     s
    
zTFBertSelfOutput.call)Fr\   r]   r^   r)   rK   r`   r   r   r8   r   rz     s   rz   c                       s.   e Zd Z fddZdd ZdddZ  ZS )	TFBertAttentionc                    s.   t  jf | t|dd| _t|dd| _d S )Nr5   r#   output)r(   r)   ra   self_attentionrz   dense_outputr4   r8   r   r   r)   #  s    zTFBertAttention.__init__c                 C   s   t d S r   NotImplementedError)r5   Zheadsr   r   r   prune_heads(  s    zTFBertAttention.prune_headsFc           	      C   sJ   |\}}}| j |||g|d}| j|d |g|d}|f|dd   }|S NrB   r   r   )r   r   )	r5   rI   rC   r   rw   rx   Zself_outputsattention_outputry   r   r   r   rK   +  s
    
zTFBertAttention.call)F)r\   r]   r^   r)   r   rK   r`   r   r   r8   r   r   "  s   r   c                       s$   e Zd Z fddZdd Z  ZS )TFBertIntermediatec                    sR   t  jf | tjjj|jt|jdd| _	t
|jtrFt|j | _n|j| _d S )Nr|   rc   )r(   r)   r   r-   r.   rm   Zintermediate_sizer   r,   r|   
isinstance
hidden_actstrACT2FNintermediate_act_fnr4   r8   r   r   r)   5  s      zTFBertIntermediate.__init__c                 C   s   |  |}| |}|S r   )r|   r   r5   rv   r   r   r   rK   ?  s    

zTFBertIntermediate.callr   r   r   r8   r   r   4  s   
r   c                       s&   e Zd Z fddZdddZ  ZS )TFBertOutputc                    sX   t  jf | tjjj|jt|jdd| _	tjjj
|jdd| _tjj|j| _d S r{   r}   r4   r8   r   r   r)   F  s      zTFBertOutput.__init__Fc                 C   s2   |\}}|  |}| j||d}| || }|S r~   r   r   r   r   r   rK   N  s
    
zTFBertOutput.call)Fr   r   r   r8   r   r   E  s   r   c                       s&   e Zd Z fddZdddZ  ZS )TFBertLayerc                    s<   t  jf | t|dd| _t|dd| _t|dd| _d S )N	attentionr   intermediater   )r(   r)   r   r   r   r   r   bert_outputr4   r8   r   r   r)   X  s    zTFBertLayer.__init__Fc                 C   sX   |\}}}| j |||g|d}|d }| |}| j||g|d}	|	f|dd   }
|
S r   )r   r   r   )r5   rI   rC   rv   rw   rx   Zattention_outputsr   Zintermediate_outputZlayer_outputry   r   r   r   rK   ^  s    

zTFBertLayer.call)Fr   r   r   r8   r   r   W  s   r   c                       s&   e Zd Z fddZdddZ  ZS )TFBertEncoderc                    s<   t  jf |  j| _ j| _ fddt jD | _d S )Nc                    s   g | ]}t  d |dqS )z
layer_._{}r   )r   rH   ).0ir6   r   r   
<listcomp>n  s     z*TFBertEncoder.__init__.<locals>.<listcomp>)r(   r)   rh   output_hidden_statesrM   num_hidden_layerslayerr4   r8   r   r   r)   j  s    zTFBertEncoder.__init__Fc                 C   s   |\}}}d}d}t | jD ]J\}}	| jr4||f }|	|||| g|d}
|
d }| jr||
d f }q| jrx||f }|f}| jr||f }| jr||f }|S )Nr   rB   r   r   )	enumerater   r   rh   )r5   rI   rC   rv   rw   rx   Zall_hidden_statesZall_attentionsr   Zlayer_moduleZlayer_outputsry   r   r   r   rK   p  s$    




zTFBertEncoder.call)Fr   r   r   r8   r   r   i  s   r   c                       s$   e Zd Z fddZdd Z  ZS )TFBertPoolerc                    s2   t  jf | tjjj|jt|jddd| _	d S )Nr   r|   )rd   Z
activationr#   )
r(   r)   r   r-   r.   rm   r+   r   r,   r|   r4   r8   r   r   r)     s    zTFBertPooler.__init__c                 C   s   |d d df }|  |}|S )Nr   )r|   )r5   rv   Zfirst_token_tensorpooled_outputr   r   r   rK     s    
zTFBertPooler.callr   r   r   r8   r   r     s   	r   c                       s$   e Zd Z fddZdd Z  ZS )TFBertPredictionHeadTransformc                    sh   t  jf | tjjj|jt|jdd| _	t
|jtrFt|j | _n|j| _tjjj|jdd| _d S r{   )r(   r)   r   r-   r.   rm   r+   r   r,   r|   r   r   r   r   transform_act_fnr/   r0   r%   r4   r8   r   r   r)     s      z&TFBertPredictionHeadTransform.__init__c                 C   s"   |  |}| |}| |}|S r   )r|   r   r%   r   r   r   r   rK     s    


z"TFBertPredictionHeadTransform.callr   r   r   r8   r   r     s   r   c                       s0   e Zd Z fddZ fddZdd Z  ZS )TFBertLMPredictionHeadc                    s.   t  jf | |j| _t|dd| _|| _d S )N	transformr   )r(   r)   r*   r   r   input_embeddingsr5   r6   r   r7   r8   r   r   r)     s    zTFBertLMPredictionHead.__init__c                    s(   | j | jfdddd| _t | d S )NzerosTbias)r;   r<   Z	trainabler#   )r=   r*   r   r(   r>   r?   r8   r   r   r>     s    zTFBertLMPredictionHead.buildc                 C   s&   |  |}| j|dd}|| j }|S )NrD   )rJ   )r   r   r   r   r   r   r   rK     s    

zTFBertLMPredictionHead.call)r\   r]   r^   r)   r>   rK   r`   r   r   r8   r   r     s   	r   c                       s$   e Zd Z fddZdd Z  ZS )TFBertMLMHeadc                    s"   t  jf | t||dd| _d S )Npredictionsr   )r(   r)   r   r   r   r8   r   r   r)     s    zTFBertMLMHead.__init__c                 C   s   |  |}|S r   )r   )r5   sequence_outputprediction_scoresr   r   r   rK     s    
zTFBertMLMHead.callr   r   r   r8   r   r     s   r   c                       s$   e Zd Z fddZdd Z  ZS )TFBertNSPHeadc                    s.   t  jf | tjjjdt|jdd| _d S )Nr   seq_relationshiprc   )	r(   r)   r   r-   r.   rm   r   r,   r   r4   r8   r   r   r)     s      zTFBertNSPHead.__init__c                 C   s   |  |}|S r   )r   )r5   r   seq_relationship_scorer   r   r   rK     s    
zTFBertNSPHead.callr   r   r   r8   r   r     s   r   c                       sB   e Zd ZeZ fddZdd Zdd Zdd ZdddZ	  Z
S )TFBertMainLayerc                    sD   t  jf | |j| _t|dd| _t|dd| _t|dd| _d S )NrU   r   encoderpooler)	r(   r)   r   r!   rU   r   r   r   r   r4   r8   r   r   r)     s
    zTFBertMainLayer.__init__c                 C   s   | j S r   )rU   r5   r   r   r   get_input_embeddings  s    z$TFBertMainLayer.get_input_embeddingsc                 C   s   t d S r   r   )r5   Znew_num_tokensr   r   r   _resize_token_embeddings  s    z(TFBertMainLayer._resize_token_embeddingsc                 C   s   t dS )z Prunes heads of the model.
            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
            See base class PreTrainedModel
        Nr   )r5   Zheads_to_pruner   r   r   _prune_heads  s    zTFBertMainLayer._prune_headsNFc                 C   s>  t |ttfr|d }t|dkr*|d n|}t|dkrB|d n|}t|dkrZ|d n|}t|dkrr|d n|}t|dkr|d n|}t|dkstdnrt |ttfr|d	}|d
|}|d|}|d|}|d|}|d|}t|dkstdn|}|d k	r4|d k	r4tdn8|d k	rHt	|}	n$|d k	rdt	|d d }	ntd|d krt
|	d}|d krt
|	d}|d d t
jt
jd d f }
t
|
t
j}
d|
 d }
|d k	rtnd g| j }| j||||g|d}| j||
|g|d}|d }| |}||f|dd   }|S )Nr   r   r   r            Too many inputs.rP   rw   rR   rQ   rx   rS   zDYou cannot specify both input_ids and inputs_embeds at the same timerL   z5You have to specify either input_ids or inputs_embedsr   g     rB   )r   tuplelistlenri   dictr   getrG   r
   r   rO   rN   rs   rt   r   r   rU   r   r   )r5   rI   rw   rR   rQ   rx   rS   rC   rP   r@   Zextended_attention_maskZembedding_outputZencoder_outputsr   r   ry   r   r   r   rK     sT    










zTFBertMainLayer.call)NNNNNF)r\   r]   r^   r   config_classr)   r   r   r   rK   r`   r   r   r8   r   r     s   
      r   c                   @   s   e Zd ZdZeZeZdZdS )TFBertPreTrainedModelz An abstract class to handle weights initialization and
        a simple interface for downloading and loading pretrained models.
    bertN)	r\   r]   r^   r_   r   r   $TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAPZpretrained_model_archive_mapZbase_model_prefixr   r   r   r   r   F  s   r   a  
    This model is a `tf.keras.Model <https://www.tensorflow.org/api_docs/python/tf/keras/Model>`__ sub-class.
    Use it as a regular TF 2.0 Keras Model and
    refer to the TF 2.0 documentation for all matter related to general usage and behavior.

    .. note::

        TF 2.0 models accepts two formats as inputs:

            - having all inputs as keyword arguments (like PyTorch models), or
            - having all inputs as a list, tuple or dict in the first positional arguments.

        This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having
        all the tensors in the first argument of the model call function: :obj:`model(inputs)`.

        If you choose this second option, there are three possibilities you can use to gather all the input Tensors
        in the first positional argument :

        - a single Tensor with input_ids only and nothing else: :obj:`model(inputs_ids)`
        - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
          :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])`
        - a dictionary with one or several input Tensors associated to the input names given in the docstring:
          :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`

    Parameters:
        config (:class:`~transformers.BertConfig`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the configuration.
            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
a  
    Args:
        input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using :class:`transformers.BertTokenizer`.
            See :func:`transformers.PreTrainedTokenizer.encode` and
            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.

            `What are input IDs? <../glossary.html#input-ids>`__
        attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.

            `What are attention masks? <../glossary.html#attention-mask>`__
        token_type_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Segment token indices to indicate first and second portions of the inputs.
            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
            corresponds to a `sentence B` token

            `What are token type IDs? <../glossary.html#token-type-ids>`__
        position_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Selected in the range ``[0, config.max_position_embeddings - 1]``.

            `What are position IDs? <../glossary.html#position-ids>`__
        head_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
            Mask to nullify selected heads of the self-attention modules.
            Mask values selected in ``[0, 1]``:
            :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
        inputs_embeds (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, embedding_dim)`, `optional`, defaults to :obj:`None`):
            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
            than the model's internal embedding lookup matrix.
        training (:obj:`boolean`, `optional`, defaults to :obj:`False`):
            Whether to activate dropout modules (if set to :obj:`True`) during training or to de-activate them
            (if set to :obj:`False`) for evaluation.
z]The bare Bert Model transformer outputing raw hidden-states without any specific head on top.c                       s,   e Zd Z fddZeedd Z  ZS )TFBertModelc                    s&   t  j|f|| t|dd| _d S )Nr   r   )r(   r)   r   r   r5   r6   rI   r7   r8   r   r   r)     s    zTFBertModel.__init__c                 K   s   | j |f|}|S )a  
    Returns:
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
        last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        pooler_output (:obj:`tf.Tensor` of shape :obj:`(batch_size, hidden_size)`):
            Last layer hidden-state of the first token of the sequence (classification token)
            further processed by a Linear layer and a Tanh activation function. The Linear
            layer weights are trained from the next sentence prediction (classification)
            objective during Bert pretraining. This output is usually *not* a good summary
            of the semantic content of the input, you're often better with averaging or pooling
            the sequence of hidden-states for the whole input sequence.
        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
            tuple of :obj:`tf.Tensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.


    Examples::

        import tensorflow as tf
        from transformers import BertTokenizer, TFBertModel

        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = TFBertModel.from_pretrained('bert-base-uncased')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
        outputs = model(input_ids)
        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
        )r   )r5   rI   r7   ry   r   r   r   rK     s    %zTFBertModel.callr\   r]   r^   r)   r   BERT_INPUTS_DOCSTRINGrK   r`   r   r   r8   r   r     s   r   zBert Model with two heads on top as done during the pre-training:
    a `masked language modeling` head and a `next sentence prediction (classification)` head. c                       s4   e Zd Z fddZdd Zeedd Z  ZS )TFBertForPreTrainingc                    sH   t  j|f|| t|dd| _t|dd| _t|| jjdd| _d S )Nr   r   	nsp___cls	mlm___cls)	r(   r)   r   r   r   nspr   rU   mlmr   r8   r   r   r)     s    zTFBertForPreTraining.__init__c                 C   s   | j jS r   r   rU   r   r   r   r   get_output_embeddings  s    z*TFBertForPreTraining.get_output_embeddingsc                 K   sV   | j |f|}|dd \}}| j||ddd}| |}||f|dd  }|S )ae  
    Return:
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
        prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        seq_relationship_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, 2)`):
            Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax).
        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
            tuple of :obj:`tf.Tensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import BertTokenizer, TFBertForPreTraining

        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = TFBertForPreTraining.from_pretrained('bert-base-uncased')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
        outputs = model(input_ids)
        prediction_scores, seq_relationship_scores = outputs[:2]

        Nr   rC   FrB   )r   r   r   r   )r5   rI   r7   ry   r   r   r   r   r   r   r   rK     s     

zTFBertForPreTraining.call	r\   r]   r^   r)   r   r   r   rK   r`   r   r   r8   r   r     s   r   z3Bert Model with a `language modeling` head on top. c                       s4   e Zd Z fddZdd Zeedd Z  ZS )TFBertForMaskedLMc                    s:   t  j|f|| t|dd| _t|| jjdd| _d S )Nr   r   r   )r(   r)   r   r   r   rU   r   r   r8   r   r   r)     s    zTFBertForMaskedLM.__init__c                 C   s   | j jS r   r   r   r   r   r   r     s    z'TFBertForMaskedLM.get_output_embeddingsc                 K   sB   | j |f|}|d }| j||ddd}|f|dd  }|S )an  
    Return:
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
        prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
            tuple of :obj:`tf.Tensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import BertTokenizer, TFBertForMaskedLM

        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = TFBertForMaskedLM.from_pretrained('bert-base-uncased')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
        outputs = model(input_ids)
        prediction_scores = outputs[0]

        r   rC   FrB   r   N)r   r   r   )r5   rI   r7   ry   r   r   r   r   r   rK     s
    zTFBertForMaskedLM.callr   r   r   r8   r   r     s   r   zKBert Model with a `next sentence prediction (classification)` head on top. c                       s,   e Zd Z fddZeedd Z  ZS )TFBertForNextSentencePredictionc                    s4   t  j|f|| t|dd| _t|dd| _d S )Nr   r   r   )r(   r)   r   r   r   r   r   r8   r   r   r)   =  s    z(TFBertForNextSentencePrediction.__init__c                 K   s6   | j |f|}|d }| |}|f|dd  }|S )a  
    Return:
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
        seq_relationship_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, 2)`)
            Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax).
        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
            tuple of :obj:`tf.Tensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import BertTokenizer, TFBertForNextSentencePrediction

        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = TFBertForNextSentencePrediction.from_pretrained('bert-base-uncased')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
        outputs = model(input_ids)
        seq_relationship_scores = outputs[0]

        r   r   N)r   r   )r5   rI   r7   ry   r   r   r   r   r   rK   C  s
    
z$TFBertForNextSentencePrediction.callr   r   r   r8   r   r   9  s   r   zBert Model transformer with a sequence classification/regression head on top (a linear layer on top of
    the pooled output) e.g. for GLUE tasks. c                       s,   e Zd Z fddZeedd Z  ZS )TFBertForSequenceClassificationc                    s^   t  j|f|| |j| _t|dd| _tjj|j	| _
tjjj|jt|jdd| _d S Nr   r   
classifierrc   r(   r)   
num_labelsr   r   r   r-   r.   r1   r2   r3   rm   r   r,   r   r   r8   r   r   r)   q  s      z(TFBertForSequenceClassification.__init__c                 K   sL   | j |f|}|d }| j||ddd}| |}|f|dd  }|S )aP  
    Return:
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
        logits (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
            tuple of :obj:`tf.Tensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import BertTokenizer, TFBertForSequenceClassification

        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
        outputs = model(input_ids)
        logits = outputs[0]

        r   rC   FrB   r   Nr   r3   r   r   )r5   rI   r7   ry   r   r[   r   r   r   rK   {  s    
z$TFBertForSequenceClassification.callr   r   r   r8   r   r   k  s   
r   zBert Model with a multiple choice classification head on top (a linear layer on top of
    the pooled output and a softmax) e.g. for RocStories/SWAG tasks. c                       s:   e Zd Z fddZedd Zeed	ddZ  Z	S )
TFBertForMultipleChoicec                    sT   t  j|f|| t|dd| _tjj|j| _	tjjj
dt|jdd| _d S )Nr   r   r   r   rc   )r(   r)   r   r   r   r-   r.   r1   r2   r3   rm   r   r,   r   r   r8   r   r   r)     s      z TFBertForMultipleChoice.__init__c                 C   s   dt tiS )zf Dummy inputs to build the network.

        Returns:
            tf.Tensor with dummy inputs
        rP   )r   Zconstantr   r   r   r   r   dummy_inputs  s    z$TFBertForMultipleChoice.dummy_inputsNFc                 C   s*  t |ttfr|d }t|dkr*|d n|}t|dkrB|d n|}t|dkrZ|d n|}t|dkrr|d n|}t|dkr|d n|}t|dkstdnnt |tr|d	}|d
|}|d|}|d|}|d|}|d|}t|dkstdn|}|dk	r6t|d }	t|d }
nt|d }	t|d }
|dk	rht	|d|
fnd}|dk	rt	|d|
fnd}|dk	rt	|d|
fnd}|dk	rt	|d|
fnd}||||||g}| j
||d}|d }| j||d}| |}t	|d|	f}|f|dd  }|S )a  
    Return:
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
        classification_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`:
            `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above).

            Classification scores (before SoftMax).
        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
            tuple of :obj:`tf.Tensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import BertTokenizer, TFBertForMultipleChoice

        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = TFBertForMultipleChoice.from_pretrained('bert-base-uncased')
        choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
        input_ids = tf.constant([tokenizer.encode(s) for s in choices])[None, :]  # Batch size 1, 2 choices
        outputs = model(input_ids)
        classification_scores = outputs[0]

        r   r   r   r   r   r   r   r   rP   rw   rR   rQ   rx   rS   NrL   rB   )r   r   r   r   ri   r   r   r
   r   rW   r   r3   r   )r5   rI   rw   rR   rQ   rx   rS   rC   rP   Znum_choicesrT   Zflat_input_idsZflat_attention_maskZflat_token_type_idsZflat_position_idsZflat_inputsry   r   r[   Zreshaped_logitsr   r   r   rK     sP    *

	
zTFBertForMultipleChoice.call)NNNNNF)
r\   r]   r^   r)   propertyr   r   r   rK   r`   r   r   r8   r   r     s   	
      r   zBert Model with a token classification head on top (a linear layer on top of
    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. c                       s,   e Zd Z fddZeedd Z  ZS )TFBertForTokenClassificationc                    s^   t  j|f|| |j| _t|dd| _tjj|j	| _
tjjj|jt|jdd| _d S r   r   r   r8   r   r   r)   "  s      z%TFBertForTokenClassification.__init__c                 K   sL   | j |f|}|d }| j||ddd}| |}|f|dd  }|S )a3  
    Return:
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
        scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`):
            Classification scores (before SoftMax).
        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
            tuple of :obj:`tf.Tensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import BertTokenizer, TFBertForTokenClassification

        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = TFBertForTokenClassification.from_pretrained('bert-base-uncased')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
        outputs = model(input_ids)
        scores = outputs[0]

        r   rC   FrB   r   Nr   )r5   rI   r7   ry   r   r[   r   r   r   rK   ,  s    
z!TFBertForTokenClassification.callr   r   r   r8   r   r     s   
r   zBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
    the hidden-states output to compute `span start logits` and `span end logits`). c                       s,   e Zd Z fddZeedd Z  ZS )TFBertForQuestionAnsweringc                    sL   t  j|f|| |j| _t|dd| _tjjj|jt	|j
dd| _d S )Nr   r   
qa_outputsrc   )r(   r)   r   r   r   r   r-   r.   rm   r   r,   r   r   r8   r   r   r)   \  s      z#TFBertForQuestionAnswering.__init__c                 K   sh   | j |f|}|d }| |}tj|ddd\}}tj|dd}tj|dd}||f|dd  }|S )ad  
    Return:
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
        start_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`):
            Span-start scores (before SoftMax).
        end_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`):
            Span-end scores (before SoftMax).
        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
            tuple of :obj:`tf.Tensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import BertTokenizer, TFBertForQuestionAnswering

        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = TFBertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

        question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
        encoding = tokenizer.encode_plus(question, text)
        input_ids, token_type_ids = encoding["input_ids"], encoding["token_type_ids"]
        start_scores, end_scores = model(tf.constant(input_ids)[None, :], token_type_ids=tf.constant(token_type_ids)[None, :])

        all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
        answer = ' '.join(all_tokens[tf.math.argmax(tf.squeeze(start_scores)) : tf.math.argmax(tf.squeeze(end_scores))+1])
        assert answer == "a nice puppet"

        r   r   rL   rr   N)r   r   r   splitZsqueeze)r5   rI   r7   ry   r   r[   Zstart_logitsZ
end_logitsr   r   r   rK   e  s    &
zTFBertForQuestionAnswering.callr   r   r   r8   r   r   V  s   	r   ):r_   loggingZnumpyr   Z
tensorflowr   Zconfiguration_bertr   Z
file_utilsr   r   r   Zmodeling_tf_utilsr   r   r	   r
   Ztokenization_utilsr   	getLoggerr\   loggerr   r   r   r   r-   r.   Z
ActivationZactivationsr    r   ZLayerr!   ra   rz   r   r   r   r   r   r   r   r   r   r   r   r   ZBERT_START_DOCSTRINGr   r   r   r   r   r   r   r   r   r   r   r   r   <module>   s   
gL"
j
).82 /5r5