""" TF 2.0 DistilBERT model
"""


import logging
import math

import numpy as np
import tensorflow as tf

from .configuration_distilbert import DistilBertConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, get_initializer, shape_list
from .tokenization_utils import BatchEncoding


logger = logging.getLogger(__name__)

TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
    "distilbert-base-uncased": "https://cdn.huggingface.co/distilbert-base-uncased-tf_model.h5",
    "distilbert-base-uncased-distilled-squad": "https://cdn.huggingface.co/distilbert-base-uncased-distilled-squad-tf_model.h5",
    "distilbert-base-cased": "https://cdn.huggingface.co/distilbert-base-cased-tf_model.h5",
    "distilbert-base-cased-distilled-squad": "https://cdn.huggingface.co/distilbert-base-cased-distilled-squad-tf_model.h5",
    "distilbert-base-multilingual-cased": "https://cdn.huggingface.co/distilbert-base-multilingual-cased-tf_model.h5",
    "distilbert-base-uncased-finetuned-sst-2-english": "https://cdn.huggingface.co/distilbert-base-uncased-finetuned-sst-2-english-tf_model.h5",
}


def gelu(x):
    """ Gaussian Error Linear Unit.
    Original Implementation of the gelu activation function in Google Bert repo when initially created.
        For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
        0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
        Also see https://arxiv.org/abs/1606.08415
    """
    cdf = 0.5 * (1.0 + tf.math.erf(x / tf.math.sqrt(2.0)))
    return x * cdf


def gelu_new(x):
    """Gaussian Error Linear Unit.
    This is a smoother version of the RELU.
    Original paper: https://arxiv.org/abs/1606.08415
    Args:
        x: float Tensor to perform activation.
    Returns:
        `x` with the GELU activation applied.
    """
    cdf = 0.5 * (1.0 + tf.tanh((np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
    return x * cdf


class TFEmbeddings(tf.keras.layers.Layer):
    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        self.vocab_size = config.vocab_size
        self.dim = config.dim
        self.initializer_range = config.initializer_range
        self.word_embeddings = TFSharedEmbeddings(
            config.vocab_size, config.dim, initializer_range=config.initializer_range, name="word_embeddings"
        )
        self.position_embeddings = tf.keras.layers.Embedding(
            config.max_position_embeddings,
            config.dim,
            embeddings_initializer=get_initializer(config.initializer_range),
            name="position_embeddings",
        )
        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="LayerNorm")
        self.dropout = tf.keras.layers.Dropout(config.dropout)

    def build(self, input_shape):
        """Build shared word embedding layer """
        with tf.name_scope("word_embeddings"):
            # Create and initialize the shared token embedding weights.
            self.word_embeddings = self.add_weight(
                "weight", shape=[self.vocab_size, self.dim], initializer=get_initializer(self.initializer_range)
            )
        super().build(input_shape)

    def call(self, inputs, inputs_embeds=None, mode="embedding", training=False):
        """Get token embeddings of inputs.
        Args:
            inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids)
            mode: string, a valid value is one of "embedding" and "linear".
        Returns:
            outputs: (1) If mode == "embedding", output embedding tensor, float32 with
                shape [batch_size, length, embedding_size]; (2) mode == "linear", output
                linear tensor, float32 with shape [batch_size, length, vocab_size].
        Raises:
            ValueError: if mode is not valid.

        Shared weights logic adapted from
            https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
        """
        if mode == "embedding":
            return self._embedding(inputs, inputs_embeds=inputs_embeds, training=training)
        elif mode == "linear":
            return self._linear(inputs)
        else:
            raise ValueError("mode {} is not valid.".format(mode))

    def _embedding(self, inputs, inputs_embeds=None, training=False):
        """
        Parameters
        ----------
        input_ids: tf.Tensor(bs, max_seq_length)
            The token ids to embed.

        Outputs
        -------
        embeddings: tf.Tensor(bs, max_seq_length, dim)
            The embedded tokens (plus position embeddings, no token_type embeddings)
        """
        if not isinstance(inputs, (tuple, list)):
            input_ids = inputs
            position_ids = None
        else:
            input_ids, position_ids = inputs

        if input_ids is not None:
            seq_length = shape_list(input_ids)[1]
        else:
            seq_length = shape_list(inputs_embeds)[1]

        if position_ids is None:
            position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :]

        if inputs_embeds is None:
            inputs_embeds = tf.gather(self.word_embeddings, input_ids)

        position_embeddings = self.position_embeddings(position_ids)  # (bs, seq_length, dim)

        embeddings = inputs_embeds + position_embeddings  # (bs, seq_length, dim)
        embeddings = self.LayerNorm(embeddings)  # (bs, seq_length, dim)
        embeddings = self.dropout(embeddings, training=training)  # (bs, seq_length, dim)
        return embeddings

    def _linear(self, inputs):
        """Computes logits by running inputs through a linear layer.
            Args:
                inputs: A float32 tensor with shape [batch_size, length, hidden_size]
            Returns:
                float32 tensor with shape [batch_size, length, vocab_size].
        """
        batch_size = shape_list(inputs)[0]
        length = shape_list(inputs)[1]
        x = tf.reshape(inputs, [-1, self.dim])
        logits = tf.matmul(x, self.word_embeddings, transpose_b=True)
        return tf.reshape(logits, [batch_size, length, self.vocab_size])


class TFMultiHeadSelfAttention(tf.keras.layers.Layer):
    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)

        self.n_heads = config.n_heads
        self.dim = config.dim
        self.dropout = tf.keras.layers.Dropout(config.attention_dropout)
        self.output_attentions = config.output_attentions

        assert self.dim % self.n_heads == 0

        self.q_lin = tf.keras.layers.Dense(
            config.dim, kernel_initializer=get_initializer(config.initializer_range), name="q_lin"
        )
        self.k_lin = tf.keras.layers.Dense(
            config.dim, kernel_initializer=get_initializer(config.initializer_range), name="k_lin"
        )
        self.v_lin = tf.keras.layers.Dense(
            config.dim, kernel_initializer=get_initializer(config.initializer_range), name="v_lin"
        )
        self.out_lin = tf.keras.layers.Dense(
            config.dim, kernel_initializer=get_initializer(config.initializer_range), name="out_lin"
        )

        self.pruned_heads = set()

    def prune_heads(self, heads):
        raise NotImplementedError

    def call(self, inputs, training=False):
        """
        Parameters
        ----------
        query: tf.Tensor(bs, seq_length, dim)
        key: tf.Tensor(bs, seq_length, dim)
        value: tf.Tensor(bs, seq_length, dim)
        mask: tf.Tensor(bs, seq_length)

        Outputs
        -------
        weights: tf.Tensor(bs, n_heads, seq_length, seq_length)
            Attention weights
        context: tf.Tensor(bs, seq_length, dim)
            Contextualized layer. Optional: only if `output_attentions=True`
        """
        query, key, value, mask, head_mask = inputs
        bs, q_length, dim = shape_list(query)
        k_length = shape_list(key)[1]
        dim_per_head = self.dim // self.n_heads
        mask_reshape = [bs, 1, 1, k_length]

        def shape(x):
            """ separate heads """
            return tf.transpose(tf.reshape(x, (bs, -1, self.n_heads, dim_per_head)), perm=(0, 2, 1, 3))

        def unshape(x):
            """ group heads """
            return tf.reshape(tf.transpose(x, perm=(0, 2, 1, 3)), (bs, -1, self.n_heads * dim_per_head))

        q = shape(self.q_lin(query))  # (bs, n_heads, q_length, dim_per_head)
        k = shape(self.k_lin(key))  # (bs, n_heads, k_length, dim_per_head)
        v = shape(self.v_lin(value))  # (bs, n_heads, k_length, dim_per_head)

        q = q / math.sqrt(dim_per_head)  # (bs, n_heads, q_length, dim_per_head)
        scores = tf.matmul(q, k, transpose_b=True)  # (bs, n_heads, q_length, k_length)
        mask = tf.reshape(mask, mask_reshape)  # (bs, 1, 1, k_length)
        scores = scores - 1e30 * (1.0 - mask)

        weights = tf.nn.softmax(scores, axis=-1)  # (bs, n_heads, q_length, k_length)
        weights = self.dropout(weights, training=training)  # (bs, n_heads, q_length, k_length)

        # Mask heads if requested
        if head_mask is not None:
            weights = weights * head_mask

        context = tf.matmul(weights, v)  # (bs, n_heads, q_length, dim_per_head)
        context = unshape(context)  # (bs, q_length, dim)
        context = self.out_lin(context)  # (bs, q_length, dim)

        if self.output_attentions:
            return (context, weights)
        else:
            return (context,)
zTFMultiHeadSelfAttention.call)F)rT   rU   rV   r'   rg   rE   rW   r   r   r3   r   rX      s   rX   c                       s&   e Zd Z fddZdddZ  ZS )TFFFNc                    s   t  jf | tjj|j| _tjjj|jt	|j
dd| _tjjj|jt	|j
dd| _|jdksvtd|j|jdkrtjjtntjjj| _d S )Nlin1rZ   lin2)relur   z+activation ({}) must be in ['relu', 'gelu']r   )r&   r'   r   r*   r+   r-   r.   rb   
hidden_dimr   r   r}   r)   r~   
activationra   rB   
Activationr   Zactivationsr   r/   r3   r   r   r'     s"         zTFFFN.__init__Fc                 C   s0   |  |}| |}| |}| j||d}|S )NrG   )r}   r   r~   r.   )r0   inputr=   r   r   r   r   rE     s
    


z
TFFFN.call)FrT   rU   rV   r'   rE   rW   r   r   r3   r   r|     s   r|   c                       s&   e Zd Z fddZdddZ  ZS )TFTransformerBlockc                    s   t  jf | |j| _|j| _|j| _tjj|j	| _	|j
| _
|j| _|j|j dks\tt|dd| _tjjjddd| _t|dd| _tjjjddd| _d S )	Nr   	attentionr    r"   sa_layer_normr$   ffnoutput_layer_norm)r&   r'   r_   r)   r   r   r*   r+   r-   r.   r   r`   ra   rX   r   r,   r   r|   r   r   r/   r3   r   r   r'     s    zTFTransformerBlock.__init__Fc           
      C   s~   |\}}}| j |||||g|d}| jr2|\}}n|d }| || }| j||d}| || }|f}	| jrz|f|	 }	|	S )a  
        Parameters
        ----------
        x: tf.Tensor(bs, seq_length, dim)
        attn_mask: tf.Tensor(bs, seq_length)

        Outputs
        -------
        sa_weights: tf.Tensor(bs, n_heads, seq_length, seq_length)
            The attention weights
        ffn_output: tf.Tensor(bs, seq_length, dim)
            The output of the transformer block contextualization.
        """
        x, attn_mask, head_mask = inputs

        # Self-attention (residual connection + layer norm)
        sa_output = self.attention([x, x, x, attn_mask, head_mask], training=training)
        if self.output_attentions:
            sa_output, sa_weights = sa_output  # (bs, seq_length, dim), (bs, n_heads, seq_length, seq_length)
        else:
            sa_output = sa_output[0]
        sa_output = self.sa_layer_norm(sa_output + x)  # (bs, seq_length, dim)

        # Feed-forward network (residual connection + layer norm)
        ffn_output = self.ffn(sa_output, training=training)  # (bs, seq_length, dim)
        ffn_output = self.output_layer_norm(ffn_output + sa_output)  # (bs, seq_length, dim)

        output = (ffn_output,)
        if self.output_attentions:
            output = (sa_weights,) + output
        return output


class TFTransformer(tf.keras.layers.Layer):
    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        self.n_layers = config.n_layers
        self.output_hidden_states = config.output_hidden_states
        self.output_attentions = config.output_attentions

        self.layer = [TFTransformerBlock(config, name="layer_._{}".format(i)) for i in range(config.n_layers)]

    def call(self, inputs, training=False):
        """
        Parameters
        ----------
        x: tf.Tensor(bs, seq_length, dim)
            Input sequence embedded.
        attn_mask: tf.Tensor(bs, seq_length)
            Attention mask on the sequence.

        Outputs
        -------
        hidden_state: tf.Tensor(bs, seq_length, dim)
            Sequence of hidden states in the last (top) layer
        all_hidden_states: Tuple[tf.Tensor(bs, seq_length, dim)]
            Tuple of length n_layers with the hidden states from each layer.
            Optional: only if output_hidden_states=True
        all_attentions: Tuple[tf.Tensor(bs, n_heads, seq_length, seq_length)]
            Tuple of length n_layers with the attention weights from each layer
            Optional: only if output_attentions=True
        """
        x, attn_mask, head_mask = inputs

        all_hidden_states = ()
        all_attentions = ()

        hidden_state = x
        for i, layer_module in enumerate(self.layer):
            if self.output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_state,)

            layer_outputs = layer_module([hidden_state, attn_mask, head_mask[i]], training=training)
            hidden_state = layer_outputs[-1]

            if self.output_attentions:
                assert len(layer_outputs) == 2
                attentions = layer_outputs[0]
                all_attentions = all_attentions + (attentions,)
            else:
                assert len(layer_outputs) == 1

        # Add the hidden state of the last layer
        if self.output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_state,)

        outputs = (hidden_state,)
        if self.output_hidden_states:
            outputs = outputs + (all_hidden_states,)
        if self.output_attentions:
            outputs = outputs + (all_attentions,)
        return outputs  # last-layer hidden state, (all hidden states), (all attentions)
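

# Output sketch (illustrative only; `transformer`, `x`, `attn_mask` and `head_mask` are
# assumed to exist): the tuple returned by TFTransformer, and passed through unchanged by
# TFDistilBertMainLayer below, grows with the config flags.
#
#   outputs = transformer([x, attn_mask, head_mask], training=False)
#   hidden_state = outputs[0]  # always present, (bs, seq_length, dim)
#   # outputs[1] is the tuple of all hidden states when config.output_hidden_states is True;
#   # the last element is the tuple of attention weights when config.output_attentions is True.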
zTFTransformer.call)Fr   r   r   r3   r   r   T  s   r   c                       s>   e Zd Z fddZdd Zdd Zdd ZdddZ  ZS )TFDistilBertMainLayerc                    s6   t  jf | |j| _t|dd| _t|dd| _d S )NrM   r   transformer)r&   r'   num_hidden_layersr   rM   r   r   r/   r3   r   r   r'     s    zTFDistilBertMainLayer.__init__c                 C   s   | j S rd   )rM   r0   r   r   r   get_input_embeddings  s    z*TFDistilBertMainLayer.get_input_embeddingsc                 C   s   t d S rd   re   )r0   Znew_num_tokensr   r   r   _resize_token_embeddings  s    z.TFDistilBertMainLayer._resize_token_embeddingsc                 C   s   t d S rd   re   )r0   Zheads_to_pruner   r   r   _prune_heads  s    z"TFDistilBertMainLayer._prune_headsNFc           
      C   s  t |ttfrt|d }t|dkr*|d n|}t|dkrB|d n|}t|dkrZ|d n|}t|dkstdnVt |ttfr|d}|d|}|d	|}|d
|}t|dkstdn|}|d k	r|d k	rtdn6|d k	rt	|}n$|d k	rt	|d d }ntd|d kr.t
|}t
j|t
jd}|d k	rNtnd g| j }| j||d}| j|||g|d}	|	S )Nr   r   r   r      zToo many inputs.rL   attention_maskrv   r<   zDYou cannot specify both input_ids and inputs_embeds at the same timerN   z5You have to specify either input_ids or inputs_embedsrF   )r<   rG   )rH   rI   rJ   r   ra   dictr
   getrA   r	   r   ZonescastZfloat32rf   r   rM   r   )
r0   rC   r   rv   r<   r=   rL   r:   Zembedding_outputZtfmr_outputr   r   r   rE     s:    






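

# Input-format sketch (illustrative only; `model`, `input_ids` and `attention_mask` are
# assumed to exist): the branching at the top of TFDistilBertMainLayer.call accepts the
# tensors packed in several equivalent ways, mirroring the note in DISTILBERT_START_DOCSTRING
# below.
#
#   model(input_ids)                                                    # single tensor
#   model([input_ids, attention_mask])                                  # list, in docstring order
#   model({"input_ids": input_ids, "attention_mask": attention_mask})   # dict or BatchEncoding
#   model(input_ids, attention_mask=attention_mask)                     # keyword arguments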
zTFDistilBertMainLayer.call)NNNF)	rT   rU   rV   r'   r   r   r   rE   rW   r   r   r3   r   r     s
   r   c                   @   s   e Zd ZdZeZeZdZdS )TFDistilBertPreTrainedModelz An abstract class to handle weights initialization and
        a simple interface for downloading and loading pretrained models.
    
distilbertN)	rT   rU   rV   __doc__r   Zconfig_class*TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAPZpretrained_model_archive_mapZbase_model_prefixr   r   r   r   r     s   r   a  
    This model is a `tf.keras.Model <https://www.tensorflow.org/api_docs/python/tf/keras/Model>`__ sub-class.
    Use it as a regular TF 2.0 Keras Model and
    refer to the TF 2.0 documentation for all matter related to general usage and behavior.

    .. note::

        TF 2.0 models accept two formats as inputs:

            - having all inputs as keyword arguments (like PyTorch models), or
            - having all inputs as a list, tuple or dict in the first positional arguments.

        This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having
        all the tensors in the first argument of the model call function: :obj:`model(inputs)`.

        If you choose this second option, there are three possibilities you can use to gather all the input Tensors
        in the first positional argument :

        - a single Tensor with input_ids only and nothing else: :obj:`model(input_ids)`
        - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
          :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])`
        - a dictionary with one or several input Tensors associated to the input names given in the docstring:
          :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`

    Parameters:
        config (:class:`~transformers.DistilBertConfig`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the configuration.
            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
"""

DISTILBERT_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using :class:`transformers.BertTokenizer`.
            See :func:`transformers.PreTrainedTokenizer.encode` and
            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.

            `What are input IDs? <../glossary.html#input-ids>`__
        attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.

            `What are attention masks? <../glossary.html#attention-mask>`__
        head_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
            Mask to nullify selected heads of the self-attention modules.
            Mask values selected in ``[0, 1]``:
            :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
        inputs_embeds (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, embedding_dim)`, `optional`, defaults to :obj:`None`):
            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
            than the model's internal embedding lookup matrix.
        training (:obj:`boolean`, `optional`, defaults to :obj:`False`):
            Whether to activate dropout modules (if set to :obj:`True`) during training or to de-activate them
            (if set to :obj:`False`) for evaluation.

"""


@add_start_docstrings(
    "The bare DistilBERT encoder/transformer outputting raw hidden-states without any specific head on top.",
    DISTILBERT_START_DOCSTRING,
)
class TFDistilBertModel(TFDistilBertPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.distilbert = TFDistilBertMainLayer(config, name="distilbert")

    @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
    def call(self, inputs, **kwargs):
        r"""
    Returns:
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.DistilBertConfig`) and inputs:
        last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
            tuple of :obj:`tf.Tensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import DistilBertTokenizer, TFDistilBertModel

        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
        model = TFDistilBertModel.from_pretrained('distilbert-base-cased')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
        outputs = model(input_ids)
        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
        """
        outputs = self.distilbert(inputs, **kwargs)
        return outputs


class TFDistilBertLMHead(tf.keras.layers.Layer):
    def __init__(self, config, input_embeddings, **kwargs):
        super().__init__(**kwargs)
        self.vocab_size = config.vocab_size

        # The output weights are the same as the input embeddings; only an
        # output-only bias is added per vocabulary token.
        self.input_embeddings = input_embeddings

    def build(self, input_shape):
        self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
        super().build(input_shape)

    def call(self, hidden_states):
        hidden_states = self.input_embeddings(hidden_states, mode="linear")
        hidden_states = hidden_states + self.bias
        return hidden_states


@add_start_docstrings("""DistilBert Model with a `masked language modeling` head on top. """, DISTILBERT_START_DOCSTRING)
class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states
        self.vocab_size = config.vocab_size

        self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
        self.vocab_transform = tf.keras.layers.Dense(
            config.dim, kernel_initializer=get_initializer(config.initializer_range), name="vocab_transform"
        )
        self.act = tf.keras.layers.Activation(gelu)
        self.vocab_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="vocab_layer_norm")
        self.vocab_projector = TFDistilBertLMHead(config, self.distilbert.embeddings, name="vocab_projector")

    def get_output_embeddings(self):
        return self.vocab_projector.input_embeddings

    @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
    def call(self, inputs, **kwargs):
        r"""

    Returns:
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.DistilBertConfig`) and inputs:
        prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
            tuple of :obj:`tf.Tensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import DistilBertTokenizer, TFDistilBertForMaskedLM

        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
        model = TFDistilBertForMaskedLM.from_pretrained('distilbert-base-cased')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
        outputs = model(input_ids)
        prediction_scores = outputs[0]

        """
        distilbert_output = self.distilbert(inputs, **kwargs)

        hidden_states = distilbert_output[0]  # (bs, seq_length, dim)
        prediction_logits = self.vocab_transform(hidden_states)  # (bs, seq_length, dim)
        prediction_logits = self.act(prediction_logits)  # (bs, seq_length, dim)
        prediction_logits = self.vocab_layer_norm(prediction_logits)  # (bs, seq_length, dim)
        prediction_logits = self.vocab_projector(prediction_logits)  # (bs, seq_length, vocab_size)

        outputs = (prediction_logits,) + distilbert_output[1:]
        return outputs  # prediction_logits, (hidden_states), (attentions)


@add_start_docstrings(
    """DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of
    the pooled output) e.g. for GLUE tasks. """,
    DISTILBERT_START_DOCSTRING,
)
class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels

        self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
        self.pre_classifier = tf.keras.layers.Dense(
            config.dim,
            kernel_initializer=get_initializer(config.initializer_range),
            activation="relu",
            name="pre_classifier",
        )
        self.classifier = tf.keras.layers.Dense(
            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
        )
        self.dropout = tf.keras.layers.Dropout(config.seq_classif_dropout)

    @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
    def call(self, inputs, **kwargs):
        r"""
    Returns:
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.DistilBertConfig`) and inputs:
        logits (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
            tuple of :obj:`tf.Tensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification

        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
        model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-cased')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
        outputs = model(input_ids)
        logits = outputs[0]

        """
        distilbert_output = self.distilbert(inputs, **kwargs)

        hidden_state = distilbert_output[0]  # (bs, seq_length, dim)
        pooled_output = hidden_state[:, 0]  # (bs, dim)
        pooled_output = self.pre_classifier(pooled_output)  # (bs, dim)
        pooled_output = self.dropout(pooled_output, training=kwargs.get("training", False))  # (bs, dim)
        logits = self.classifier(pooled_output)  # (bs, num_labels)

        outputs = (logits,) + distilbert_output[1:]
        return outputs  # logits, (hidden_states), (attentions)


@add_start_docstrings(
    """DistilBert Model with a token classification head on top (a linear layer on top of
    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
    DISTILBERT_START_DOCSTRING,
)
class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels

        self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
        self.dropout = tf.keras.layers.Dropout(config.dropout)
        self.classifier = tf.keras.layers.Dense(
            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
        )

    @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
    def call(self, inputs, **kwargs):
        r"""
    Returns:
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.DistilBertConfig`) and inputs:
        scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`):
            Classification scores (before SoftMax).
        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
            tuple of :obj:`tf.Tensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import DistilBertTokenizer, TFDistilBertForTokenClassification

        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
        model = TFDistilBertForTokenClassification.from_pretrained('distilbert-base-cased')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
        outputs = model(input_ids)
        scores = outputs[0]
        """
        outputs = self.distilbert(inputs, **kwargs)
        sequence_output = outputs[0]  # (bs, seq_length, dim)

        sequence_output = self.dropout(sequence_output, training=kwargs.get("training", False))
        logits = self.classifier(sequence_output)  # (bs, seq_length, num_labels)

        outputs = (logits,) + outputs[1:]
        return outputs  # scores, (hidden_states), (attentions)


@add_start_docstrings(
    """DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD
    (linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """,
    DISTILBERT_START_DOCSTRING,
)
class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)

        self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
        self.qa_outputs = tf.keras.layers.Dense(
            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
        )
        assert config.num_labels == 2
        self.dropout = tf.keras.layers.Dropout(config.qa_dropout)

    @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
    def call(self, inputs, **kwargs):
        r"""
    Return:
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.DistilBertConfig`) and inputs:
        start_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`):
            Span-start scores (before SoftMax).
        end_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`):
            Span-end scores (before SoftMax).
        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
            tuple of :obj:`tf.Tensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import DistilBertTokenizer, TFDistilBertForQuestionAnswering

        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
        model = TFDistilBertForQuestionAnswering.from_pretrained('distilbert-base-cased')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
        outputs = model(input_ids)
        start_scores, end_scores = outputs[:2]

        """
        distilbert_output = self.distilbert(inputs, **kwargs)

        hidden_states = distilbert_output[0]  # (bs, seq_length, dim)
        hidden_states = self.dropout(hidden_states, training=kwargs.get("training", False))
        logits = self.qa_outputs(hidden_states)  # (bs, seq_length, 2)
        start_logits, end_logits = tf.split(logits, 2, axis=-1)
        start_logits = tf.squeeze(start_logits, axis=-1)  # (bs, seq_length)
        end_logits = tf.squeeze(end_logits, axis=-1)  # (bs, seq_length)

        outputs = (start_logits, end_logits) + distilbert_output[1:]
        return outputs  # start_logits, end_logits, (hidden_states), (attentions)