""" TF 2.0 OpenAI GPT model."""


import logging

import numpy as np
import tensorflow as tf

from .configuration_openai import OpenAIGPTConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .modeling_tf_utils import (
    TFConv1D,
    TFPreTrainedModel,
    TFSequenceSummary,
    TFSharedEmbeddings,
    get_initializer,
    shape_list,
)
from .tokenization_utils import BatchEncoding


logger = logging.getLogger(__name__)

TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP = {
    "openai-gpt": "https://cdn.huggingface.co/openai-gpt-tf_model.h5",
}


def gelu(x):
    """Gaussian Error Linear Unit.
    This is a smoother version of the RELU.
    Original paper: https://arxiv.org/abs/1606.08415
    Args:
        x: float Tensor to perform activation.
    Returns:
        `x` with the GELU activation applied.
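    Note (added):
        This is the tanh-based approximation of the exact GELU, x * Phi(x); the
        sqrt(2 / pi) and 0.044715 constants in the body come from that approximation.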
    """
    cdf = 0.5 * (1.0 + tf.tanh((np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
    return x * cdf


def swish(x):
    return x * tf.math.sigmoid(x)


ACT_FNS = {
    "gelu": tf.keras.layers.Activation(gelu),
    "relu": tf.keras.activations.relu,
    "swish": tf.keras.layers.Activation(swish),
}


class TFAttention(tf.keras.layers.Layer):
    def __init__(self, nx, n_ctx, config, scale=False, **kwargs):
        super().__init__(**kwargs)
        self.output_attentions = config.output_attentions

        n_state = nx  # in Attention: n_state = nx = n_embd
        assert n_state % config.n_head == 0
        self.n_ctx = n_ctx
        self.n_head = config.n_head
        self.split_size = n_state
        self.scale = scale

        self.c_attn = TFConv1D(n_state * 3, nx, initializer_range=config.initializer_range, name="c_attn")
        self.c_proj = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_proj")
        self.attn_dropout = tf.keras.layers.Dropout(config.attn_pdrop)
        self.resid_dropout = tf.keras.layers.Dropout(config.resid_pdrop)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        pass

    @staticmethod
    def causal_attention_mask(nd, ns, dtype):
        """1's in the lower triangle, counting from the lower right corner.
        Same as tf.matrix_band_part(tf.ones([nd, ns]), -1, ns-nd), but doesn't produce garbage on TPUs.
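        Illustration (added note): with nd = ns = 3 this returns the lower-triangular
        matrix [[1, 0, 0], [1, 1, 0], [1, 1, 1]], which `_attn` applies additively as
        ``w * b - 1e4 * (1 - b)`` so masked positions get a large negative score before the softmax.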
        """
        i = tf.range(nd)[:, None]
        j = tf.range(ns)
        m = i >= j - ns + nd
        return tf.cast(m, dtype)

    def _attn(self, inputs, training=False):
        q, k, v, attention_mask, head_mask = inputs
        # q, k, v have shape [batch, heads, sequence, features]
        w = tf.matmul(q, k, transpose_b=True)
        if self.scale:
            dk = tf.cast(shape_list(k)[-1], tf.float32)  # scale attention scores
            w = w / tf.math.sqrt(dk)

        # w has shape [batch, heads, dst_sequence, src_sequence]
        _, _, nd, ns = shape_list(w)
        b = self.causal_attention_mask(nd, ns, dtype=w.dtype)
        b = tf.reshape(b, [1, 1, nd, ns])
        w = w * b - 1e4 * (1 - b)

        if attention_mask is not None:
            # Apply the attention mask
            w = w + attention_mask

        w = tf.nn.softmax(w, axis=-1)
        w = self.attn_dropout(w, training=training)

        # Mask heads if we want to
        if head_mask is not None:
            w = w * head_mask

        outputs = [tf.matmul(w, v)]
        if self.output_attentions:
            outputs.append(w)
        return outputs

    def merge_heads(self, x):
        x = tf.transpose(x, [0, 2, 1, 3])
        x_shape = shape_list(x)
        new_x_shape = x_shape[:-2] + [x_shape[-2] * x_shape[-1]]
        return tf.reshape(x, new_x_shape)

    def split_heads(self, x):
        x_shape = shape_list(x)
        new_x_shape = x_shape[:-1] + [self.n_head, x_shape[-1] // self.n_head]
        x = tf.reshape(x, new_x_shape)
        return tf.transpose(x, (0, 2, 1, 3))  # (batch, head, seq_length, head_features)

    def call(self, inputs, training=False):
        x, attention_mask, head_mask = inputs

        x = self.c_attn(x)
        query, key, value = tf.split(x, 3, axis=2)
        query = self.split_heads(query)
        key = self.split_heads(key)
        value = self.split_heads(value)

        attn_outputs = self._attn([query, key, value, attention_mask, head_mask], training=training)
        a = attn_outputs[0]

        a = self.merge_heads(a)
        a = self.c_proj(a)
        a = self.resid_dropout(a, training=training)

        outputs = [a] + attn_outputs[1:]
        return outputs  # a, (attentions)
zTFAttention.call)F)F)F)__name__
__module____qualname__r%   r9   staticmethodrB   rW   r[   r\   rb   __classcell__r   r   r7   r   r   B   s   
	
r   c                       s&   e Zd Z fddZdddZ  ZS )TFMLPc                    sX   t  jf | |j}t|||jdd| _t|||jdd| _t| _t	j
j|j| _d S )Nc_fcr    r#   )r$   r%   n_embdr   r!   ri   r#   r   actr   r+   r,   r-   r/   dropout)r2   r6   r4   r5   r3   r7   r   r   r%      s    zTFMLP.__init__Fc                 C   s,   |  | |}| |}| j||d}|S )NrF   )rk   ri   r#   rl   )r2   r   rG   hh2r   r   r   rb      s    
z
TFMLP.call)Frc   rd   re   r%   rb   rg   r   r   r7   r   rh      s   rh   c                       s(   e Zd Zd fdd	ZdddZ  ZS )TFBlockFc                    sl   t  jf | |j}t||||dd| _tjjj|j	dd| _
td| |dd| _tjjj|j	dd| _d S )Nattnr"   ln_1)epsilonr"      mlpln_2)r$   r%   rj   r   rq   r   r+   r,   ZLayerNormalizationZlayer_norm_epsilonrs   rh   rv   rw   )r2   r)   r4   r*   r5   r3   r7   r   r   r%      s    zTFBlock.__init__c                 C   sf   |\}}}| j |||g|d}|d }| || }| j||d}	| ||	 }
|
g|dd   }|S )NrF   r   r   )rq   rs   rv   rw   )r2   rM   rG   r   rQ   rR   Zoutput_attnra   nrA   rm   rV   r   r   r   rb      s    
zTFBlock.call)F)Fro   r   r   r7   r   rp      s   rp   c                       s>   e Zd Z fddZdd Zdd Zdd ZdddZ  ZS )TFOpenAIGPTMainLayerc                    s   t  j||  j| _ j| _ j| _ j| _ j| _t j j j	dd| _
tjjj j jt j	dd| _tjj j| _ fddt jD | _d S )Ntokens_embedr    positions_embed)Zembeddings_initializerr"   c              	      s$   g | ]}t  j d d|dqS )Tzh_._{})r*   r"   )rp   r)   format).0r?   r4   r   r   
<listcomp>   s     z1TFOpenAIGPTMainLayer.__init__.<locals>.<listcomp>)r$   r%   output_hidden_statesr&   Zn_layernum_hidden_layersZ
vocab_sizerj   r	   r!   rz   r   r+   r,   Z	EmbeddingZn_positionsr
   r{   r-   Z
embd_pdropdropr:   rm   r2   r4   rM   r5   r7   r~   r   r%      s(       zTFOpenAIGPTMainLayer.__init__c                 C   s   | j S r   )rz   r2   r   r   r   get_input_embeddings   s    z)TFOpenAIGPTMainLayer.get_input_embeddingsc                 C   s   t d S r   NotImplementedError)r2   Znew_num_tokensr   r   r   _resize_token_embeddings   s    z-TFOpenAIGPTMainLayer._resize_token_embeddingsc                 C   s   t dS )zz Prunes heads of the model.
            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
        """
        raise NotImplementedError

    def call(
        self,
        inputs,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        training=False,
    ):
        if isinstance(inputs, (tuple, list)):
            input_ids = inputs[0]
            attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
            token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids
            position_ids = inputs[3] if len(inputs) > 3 else position_ids
            head_mask = inputs[4] if len(inputs) > 4 else head_mask
            inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
            assert len(inputs) <= 6, "Too many inputs."
        elif isinstance(inputs, (dict, BatchEncoding)):
            input_ids = inputs.get("input_ids")
            attention_mask = inputs.get("attention_mask", attention_mask)
            token_type_ids = inputs.get("token_type_ids", token_type_ids)
            position_ids = inputs.get("position_ids", position_ids)
            head_mask = inputs.get("head_mask", head_mask)
            inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
            assert len(inputs) <= 6, "Too many inputs."
        else:
            input_ids = inputs

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = shape_list(input_ids)
            input_ids = tf.reshape(input_ids, [-1, input_shape[-1]])
        elif inputs_embeds is not None:
            input_shape = shape_list(inputs_embeds)[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if position_ids is None:
            position_ids = tf.range(input_shape[-1], dtype=tf.int32)[tf.newaxis, :]

        if attention_mask is not None:
            # We create a 3D attention mask from a 2D tensor mask with shape
            # [batch_size, 1, 1, to_seq_length], so it broadcasts to
            # [batch_size, num_heads, from_seq_length, to_seq_length].
            attention_mask = attention_mask[:, tf.newaxis, tf.newaxis, :]

            # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
            # masked positions, this creates a tensor which is 0.0 for positions we
            # want to attend and -10000.0 for masked positions.
            attention_mask = tf.cast(attention_mask, tf.float32)
            attention_mask = (1.0 - attention_mask) * -10000.0
        else:
            attention_mask = None

        # Prepare head mask if needed (1.0 in head_mask indicates we keep the head).
        if head_mask is not None:
            raise NotImplementedError
        else:
            head_mask = [None] * self.num_hidden_layers

        position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]])

        if inputs_embeds is None:
            inputs_embeds = self.tokens_embed(input_ids, mode="embedding")
        position_embeds = self.positions_embed(position_ids)
        if token_type_ids is not None:
            token_type_ids = tf.reshape(token_type_ids, [-1, shape_list(token_type_ids)[-1]])
            token_type_embeds = self.tokens_embed(token_type_ids, mode="embedding")
        else:
            token_type_embeds = 0
        hidden_states = inputs_embeds + position_embeds + token_type_embeds
        hidden_states = self.drop(hidden_states, training=training)

        output_shape = input_shape + [shape_list(hidden_states)[-1]]

        all_attentions = []
        all_hidden_states = ()
        for i, block in enumerate(self.h):
            if self.output_hidden_states:
                all_hidden_states = all_hidden_states + (tf.reshape(hidden_states, output_shape),)

            outputs = block([hidden_states, attention_mask, head_mask[i]], training=training)
            hidden_states = outputs[0]
            if self.output_attentions:
                all_attentions.append(outputs[1])

        hidden_states = tf.reshape(hidden_states, output_shape)
        # Add the last hidden state
        if self.output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        outputs = (hidden_states,)
        if self.output_hidden_states:
            outputs = outputs + (all_hidden_states,)
        if self.output_attentions:
            # let the number of heads free (-1) so we can extract attention even after head pruning
            attention_output_shape = input_shape[:-1] + [-1] + shape_list(all_attentions[0])[-2:]
            all_attentions = tuple(tf.reshape(t, attention_output_shape) for t in all_attentions)
            outputs = outputs + (all_attentions,)
        return outputs  # last hidden state, (all hidden_states), (attentions)


class TFOpenAIGPTPreTrainedModel(TFPreTrainedModel):
    """ An abstract class to handle weights initialization and
        a simple interface for downloading and loading pretrained models.
    """

    config_class = OpenAIGPTConfig
    pretrained_model_archive_map = TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP
    base_model_prefix = "transformer"


OPENAI_GPT_START_DOCSTRING = r"""

    .. note::
        TF 2.0 models accept two formats as inputs:

            - having all inputs as keyword arguments (like PyTorch models), or
            - having all inputs as a list, tuple or dict in the first positional arguments.

        This second option is useful when using the :obj:`tf.keras.Model.fit()` method, which currently requires having
        all the tensors in the first argument of the model call function: :obj:`model(inputs)`.

        If you choose this second option, there are three possibilities you can use to gather all the input Tensors
        in the first positional argument:

        - a single Tensor with input_ids only and nothing else: :obj:`model(input_ids)`
        - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
          :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])`
        - a dictionary with one or several input Tensors associated to the input names given in the docstring:
          :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`


    Parameters:
        config (:class:`~transformers.OpenAIGPTConfig`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the configuration.
            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
"""

OPENAI_GPT_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using :class:`transformers.GPT2Tokenizer`.
            See :func:`transformers.PreTrainedTokenizer.encode` and
            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.

            `What are input IDs? <../glossary.html#input-ids>`__
        attention_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.

            `What are attention masks? <../glossary.html#attention-mask>`__
        token_type_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Segment token indices to indicate first and second portions of the inputs.
            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
            corresponds to a `sentence B` token

            `What are token type IDs? <../glossary.html#token-type-ids>`_
        position_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Selected in the range ``[0, config.max_position_embeddings - 1]``.

            `What are position IDs? <../glossary.html#position-ids>`_
        head_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
            Mask to nullify selected heads of the self-attention modules.
            Mask values selected in ``[0, 1]``:
            :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
        inputs_embeds (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
            than the model's internal embedding lookup matrix.
        training (:obj:`boolean`, `optional`, defaults to :obj:`False`):
            Whether to activate dropout modules (if set to :obj:`True`) during training or to de-activate them
            (if set to :obj:`False`) for evaluation.
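
    Example (added for illustration; assumes an already-built ``model`` and tensors named
    ``input_ids`` and ``attention_mask``) -- the input formats described above are equivalent::

        outputs = model(input_ids, attention_mask=attention_mask)
        outputs = model([input_ids, attention_mask])
        outputs = model({"input_ids": input_ids, "attention_mask": attention_mask})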
"""


@add_start_docstrings(
    "The bare OpenAI GPT transformer model outputting raw hidden-states without any specific head on top.",
    OPENAI_GPT_START_DOCSTRING,
)
class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.transformer = TFOpenAIGPTMainLayer(config, name="transformer")

    @add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING)
    def call(self, inputs, **kwargs):
        r"""
    Return:
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.OpenAIGPTConfig`) and inputs:
        last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the last layer of the model.
        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`tf.Tensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        import tensorflow as tf
        from transformers import OpenAIGPTTokenizer, TFOpenAIGPTModel

        tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
        model = TFOpenAIGPTModel.from_pretrained('openai-gpt')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
        outputs = model(input_ids)
        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple

        """
        outputs = self.transformer(inputs, **kwargs)
        return outputs


@add_start_docstrings(
    """OpenAI GPT Model transformer with a language modeling head on top
    (linear layer with weights tied to the input embeddings). """,
    OPENAI_GPT_START_DOCSTRING,
)
class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.transformer = TFOpenAIGPTMainLayer(config, name="transformer")

    def get_output_embeddings(self):
        return self.transformer.tokens_embed

    @add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING)
    def call(self, inputs, **kwargs):
        r"""
    Return:
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.OpenAIGPTConfig`) and inputs:
        prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`tf.Tensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        import tensorflow as tf
        from transformers import OpenAIGPTTokenizer, TFOpenAIGPTLMHeadModel

        tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
        model = TFOpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
        outputs = model(input_ids)
        logits = outputs[0]

        """
        transformer_outputs = self.transformer(inputs, **kwargs)
        hidden_states = transformer_outputs[0]

        lm_logits = self.transformer.tokens_embed(hidden_states, mode="linear")

        outputs = (lm_logits,) + transformer_outputs[1:]

        return outputs  # lm_logits, (all hidden_states), (attentions)
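
# Added illustrative note (not in the original file): the `lm_logits` returned first in the
# tuple above can be turned into a greedy next-token prediction along the lines of
#     next_token_logits = outputs[0][:, -1, :]
#     next_token_id = tf.argmax(next_token_logits, axis=-1)
# assuming `outputs = model(input_ids)` with a TFOpenAIGPTLMHeadModel instance.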
    zTFOpenAIGPTLMHeadModel.call	rc   rd   re   r%   r   r   r   rb   rg   r   r   r7   r   r     s   r   at  OpenAI GPT Model transformer with a language modeling and a multiple-choice classification
    head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers.
    The language modeling head has its weights tied to the input embeddings,
    the classification head takes as input the hidden state of a specified classification token index in the input sequence.
    """,
    OPENAI_GPT_START_DOCSTRING,
)
class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        config.num_labels = 1
        self.transformer = TFOpenAIGPTMainLayer(config, name="transformer")
        self.multiple_choice_head = TFSequenceSummary(
            config, initializer_range=config.initializer_range, name="multiple_choice_head"
        )

    def get_output_embeddings(self):
        return self.transformer.tokens_embed

    @add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING)
    def call(
        self,
        inputs,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        mc_token_ids=None,
        training=False,
    ):
        r"""
        mc_token_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, num_choices)`, `optional`, defaults to the index of the last token of the input):
            Index of the classification token in each input sequence.
            Selected in the range ``[0, input_ids.size(-1) - 1]``.

    Return:
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.OpenAIGPTConfig`) and inputs:
        lm_prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        mc_prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`):
            Prediction scores of the multiple choice classification head (scores for each choice before SoftMax).
        past (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`):
            Contains pre-computed hidden-states (key and values in the attention blocks).
            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
            should not be passed as input ids as they have already been computed.
        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`tf.Tensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.


    Examples::

        # For example purposes. Not runnable.
        import tensorflow as tf
        from transformers import OpenAIGPTTokenizer, TFOpenAIGPTDoubleHeadsModel

        tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
        model = TFOpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt')

        # Add a [CLS] to the vocabulary (we should train it also!)
        # This option is currently not implemented in TF 2.0
        raise NotImplementedError
        tokenizer.add_special_tokens({'cls_token': '[CLS]'})
        model.resize_token_embeddings(len(tokenizer))  # Update the model embeddings with the new vocabulary size
        print(tokenizer.cls_token_id, len(tokenizer))  # The new token is the last token of the vocabulary

        choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
        input_ids = tf.constant([tokenizer.encode(s) for s in choices])[None, :]  # Batch size 1, 2 choices
        mc_token_ids = tf.constant([input_ids.size(-1), input_ids.size(-1)])[None, :]  # Batch size 1
        outputs = model(input_ids, mc_token_ids=mc_token_ids)
        lm_prediction_scores, mc_prediction_scores = outputs[:2]

        """
        if isinstance(inputs, (tuple, list)):
            input_ids = inputs[0]
            attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
            token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids
            position_ids = inputs[3] if len(inputs) > 3 else position_ids
            head_mask = inputs[4] if len(inputs) > 4 else head_mask
            inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
            mc_token_ids = inputs[6] if len(inputs) > 6 else mc_token_ids
            assert len(inputs) <= 7, "Too many inputs."
        elif isinstance(inputs, dict):
            input_ids = inputs.get("input_ids")
            attention_mask = inputs.get("attention_mask", attention_mask)
            token_type_ids = inputs.get("token_type_ids", token_type_ids)
            position_ids = inputs.get("position_ids", position_ids)
            head_mask = inputs.get("head_mask", head_mask)
            inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
            mc_token_ids = inputs.get("mc_token_ids", mc_token_ids)
            assert len(inputs) <= 7, "Too many inputs."
        else:
            input_ids = inputs

        if input_ids is not None:
            input_shapes = shape_list(input_ids)
        else:
            input_shapes = shape_list(inputs_embeds)[:-1]

        seq_length = input_shapes[-1]

        flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None
        flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
        flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
        flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None

        flat_inputs = [
            flat_input_ids,
            flat_attention_mask,
            flat_token_type_ids,
            flat_position_ids,
            head_mask,
            inputs_embeds,
        ]

        transformer_outputs = self.transformer(flat_inputs, training=training)
        hidden_states = transformer_outputs[0]

        hidden_states = tf.reshape(hidden_states, input_shapes + shape_list(hidden_states)[-1:])

        lm_logits = self.transformer.tokens_embed(hidden_states, mode="linear")
        mc_logits = self.multiple_choice_head([hidden_states, mc_token_ids], training=training)

        mc_logits = tf.squeeze(mc_logits, axis=-1)

        outputs = (lm_logits, mc_logits) + transformer_outputs[1:]

        return outputs  # lm logits, mc logits, (all hidden_states), (attentions)