U
    &c
k                     @   s>  d Z ddlZddlZddlZddlmZ ddlm	Z	m
Z
 ddlmZmZmZmZ ddlmZ eeZdd	iZd
d Zdd Zd%ddZG dd dejjjZd&ddZG dd dejjjZeG dd dejjjZG dd deZ dZ!dZ"e	de!G dd de Z#G d d! d!ejjjZ$e	d"e!G d#d$ d$e Z%dS )'z TF 2.0 CTRL model.    N   )
CTRLConfig)add_start_docstrings add_start_docstrings_to_callable)TFPreTrainedModelTFSharedEmbeddingskeras_serializable
shape_list)BatchEncodingZctrlz+https://cdn.huggingface.co/ctrl-tf_model.h5c                 C   s*   dt dd|d  t |  }| | S )Nr   i'     )nppowerfloat32)posid_model_sizeZangle_rates r   A/tmp/pip-unpacked-wheel-ymerj3tt/transformers/modeling_tf_ctrl.py
angle_defn#   s    "r   c                 C   s   t t| d d tjf t|tjd d f |}t|d d dd df }t|d d dd df }tjtj||gddtj	d}|S )Nr   r   r   ZaxisZdtype)
r   r   ZarangenewaxissincostfcastZconcatenater   )positionr   Z
angle_radsZsinesZcosinespos_encodingr   r   r   positional_encoding(   s
    4r   c                 C   s   t j| |dd}t t|d t j}|t j| }|d k	rJ||d 7 }|d k	rZ|| }t jj|dd}	|d k	rz|	| }	t |	|}
|
|	fS )NT)Ztranspose_br        r   )	r   matmulr   r	   r   mathsqrtnnZsoftmax)qkvmaskattention_mask	head_maskZ	matmul_qkZdkZscaled_attention_logitsZattention_weightsoutputr   r   r   scaled_dot_product_attention4   s    r,   c                       s0   e Zd Zd fdd	Zdd Zd	ddZ  ZS )
TFMultiHeadAttentionFc                    s   t  jf | || _|| _|| _t|| j | _tjj	j
|dd| _tjj	j
|dd| _tjj	j
|dd| _tjj	j
|dd| _d S )NWqnameWkWvdense)super__init__output_attentions	num_headsr   intdepthr   keraslayersDenser.   r1   r2   r3   )selfr   r7   r6   kwargs	__class__r   r   r5   N   s    zTFMultiHeadAttention.__init__c                 C   s.   t ||d| j| jf}t j|ddddgdS )Nr   r   r   r      perm)r   reshaper7   r9   	transpose)r=   x
batch_sizer   r   r   split_into_heads\   s    z%TFMultiHeadAttention.split_into_headsc                 C   sV  |\}}}}}}}	}
t |d }| |}| |}| |}| ||}| ||}| ||}|d k	rtj|dd\}}tj||fdd}tj||fdd}t|
rt	|
drt
|
 }
nd}
|
dkrtj||fdd}nd}t||||||	}tj|d dddd	gd
}|d }t||d| jf}| |}||f}| jrR||f }|S )Nr   r   numpyTNr   r   rA   rB   r   )r	   r.   r1   r2   rH   r   ZunstackconcatZ	is_tensorhasattrboolrJ   stackr,   rE   rD   r   r3   r6   )r=   inputstrainingr'   r&   r%   r(   
layer_pastr)   r*   	use_cacherG   Zpast_keyZ
past_valuepresentr+   Zscaled_attentionZattnZoriginal_size_attentionoutputsr   r   r   call`   s8    






zTFMultiHeadAttention.call)F)F)__name__
__module____qualname__r5   rH   rV   __classcell__r   r   r?   r   r-   M   s   r-    c                 C   s2   t jjt jjj|dddt jjj| ddgddS )NZrelu0)Z
activationr0   2r/   ffn)r   r:   Z
Sequentialr;   r<   )r   dffr0   r   r   r   point_wise_feed_forward_network   s    $r`   c                       s(   e Zd Zd fdd	Zd	ddZ  ZS )
TFEncoderLayer皙?ư>Fc                    s|   t  jf | t|||dd| _t||dd| _tjjj	|dd| _
tjjj	|dd| _tjj|| _tjj|| _d S )Nmulti_head_attentionr/   r^   
layernorm1epsilonr0   
layernorm2)r4   r5   r-   rd   r`   r^   r   r:   r;   LayerNormalizationre   rh   Dropoutdropout1dropout2)r=   r   r7   r_   Zratelayer_norm_epsilonr6   r>   r?   r   r   r5      s       zTFEncoderLayer.__init__c              	   C   s   |\}}}}}}|  |}	| j|	|	|	|||||g|d}
|
d }| j||d}|| }| |}| |}| j||d}|| }|f|
dd   }|S )NrQ   r   r   )re   rd   rk   rh   r^   rl   )r=   rP   rQ   rF   r(   rR   r)   r*   rS   normedZattn_outputsZattn_outputZout1Zout2Z
ffn_outputrU   r   r   r   rV      s    
 

zTFEncoderLayer.call)rb   rc   F)F)rW   rX   rY   r5   rV   rZ   r   r   r?   r   ra      s
        ra   c                       sB   e Zd ZeZ fddZdd Zdd Zdd ZdddZ	  Z
S )TFCTRLMainLayerc                    s   t  jf |  j| _ j| _ j| _ j| _t j	| j| _
t j j jdd| _tjj j| _ fddt jD | _tjjj jdd| _d S )Nw)initializer_ranger0   c                    s4   g | ],}t  j j j j j jd |dqS )zh_._{}r/   )ra   n_embdZn_headr_   Zresid_pdroprm   r6   format).0r   configr   r   
<listcomp>   s   
z,TFCTRLMainLayer.__init__.<locals>.<listcomp>	layernormrf   )r4   r5   output_hidden_statesr6   rs   r   Zn_layer
num_layersr   Zn_positionsr   r   
vocab_sizerr   rq   r   r:   r;   rj   Z
embd_pdropdropoutrangehri   rm   ry   )r=   rw   r>   r?   rv   r   r5      s"       

zTFCTRLMainLayer.__init__c                 C   s   | j S rK   )rq   r=   r   r   r   get_input_embeddings   s    z$TFCTRLMainLayer.get_input_embeddingsc                 C   s   t d S rK   NotImplementedError)r=   Znew_num_tokensr   r   r   _resize_token_embeddings   s    z(TFCTRLMainLayer._resize_token_embeddingsc                 C   s   t dS )z~ Prunes heads of the model.
                heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
        Nr   )r=   Zheads_to_pruner   r   r   _prune_heads   s    zTFCTRLMainLayer._prune_headsNTFc
                    sH  t |ttfr|d }
t|dkr*|d n|}t|dkrB|d n|}t|dkrZ|d n|}t|dkrr|d n|}t|dkr|d n|}t|dkr|d n|}t|dkr|d n|}t|d	kstd
nt |ttfrZ|d}
|d|}|d|}|d|}|d|}|d|}|d|}|d|}t|d	ks^td
n|}
|d k	r|
d k	r|
d d dd f }
|d k	r|d d dd f }|d k	r|d d dd f }|
d k	r|d k	rtdnL|
d k	rt	|
}t
|
d|d g}
n$|d k	r$t	|d d }ntd|d krLd}d gt| j }nt	|d d d }|d krt
j||d | t
jdt
jd d f }t
||d dg}|d k	r|d d t
jt
jd d f }t
|t
j}d| d }nd }|d k	rtnd g| j }|d k	rVt
|dt	|d g}| j|dd}|t
jt
| jt
j9 }nd}t
|dt	|d g}|d kr| j|
dd}|d }dt
jt
||fdd }|t
jt
| jt
j9 }t
| j|}|| | }| j||	d}|t	|d g }d}d}g }tt | j|D ]~\}\}}| j!rH|t
||f }||||||| |g|	d}|d d \}}|dkr||f }| j"r"|#|d  q"| $|}t
||}| j!r||f }|f}|dkr||f }| j!r||f }| j"rD|d d dg t	|d dd    t fdd |D }||f }|S )!Nr   r   r   rA                  zToo many inputs.	input_idspastr)   token_type_idsposition_idsr*   inputs_embedsrS   r   zDYou cannot specify both input_ids and inputs_embeds at the same timez5You have to specify either input_ids or inputs_embedsrI   r   g      ?r    Z	embeddingmodern   r   Tc                 3   s   | ]}t | V  qd S rK   )r   rD   )ru   tZattention_output_shaper   r   	<genexpr>s  s     z'TFCTRLMainLayer.call.<locals>.<genexpr>)%
isinstancetuplelistlenAssertionErrordictr
   get
ValueErrorr	   r   rD   r   r~   Zint32r   Ztiler   r   r   r{   rq   r"   r#   r   ZlinalgZ	band_partZonesZgatherr   r}   	enumerateziprz   r6   appendry   )r=   rP   r   r)   r   r   r*   r   rS   rQ   r   input_shapeZpast_lengthZtoken_type_embedsZseq_lenr(   Z
pos_embedshidden_statesZoutput_shapeZpresentsZall_hidden_statesZall_attentionsr   r   rR   rU   rT   r   r   r   rV      s    









(










&
zTFCTRLMainLayer.call)NNNNNNTF)rW   rX   rY   r   config_classr5   r   r   r   rV   rZ   r   r   r?   r   rp      s   	        rp   c                   @   s   e Zd ZdZeZeZdZdS )TFCTRLPreTrainedModelz An abstract class to handle weights initialization and
        a simple interface for downloading and loading pretrained models.
    transformerN)	rW   rX   rY   __doc__r   r   $TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAPZpretrained_model_archive_mapZbase_model_prefixr   r   r   r   r   x  s   r   a  

    .. note::
        TF 2.0 models accepts two formats as inputs:

            - having all inputs as keyword arguments (like PyTorch models), or
            - having all inputs as a list, tuple or dict in the first positional arguments.

        This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having
        all the tensors in the first argument of the model call function: :obj:`model(inputs)`.

        If you choose this second option, there are three possibilities you can use to gather all the input Tensors
        in the first positional argument :

        - a single Tensor with input_ids only and nothing else: :obj:`model(inputs_ids)`
        - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
          :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])`
        - a dictionary with one or several input Tensors associated to the input names given in the docstring:
          :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`

    Parameters:
        config (:class:`~transformers.CTRLConfig`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the configuration.
            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
a  
    Args:
        input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.
            If `past` is used, optionally only the last `input_ids` have to be input (see `past`).

            Indices can be obtained using :class:`transformers.CTRLTokenizer`.
            See :func:`transformers.PreTrainedTokenizer.encode` and
            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.

            `What are input IDs? <../glossary.html#input-ids>`__
        past (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`):
            Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
            (see `past` output below). Can be used to speed up sequential decoding.
            If `past` is used, the user can optionally input only the last `input_ids`
            (those that don't have their past given to this model) of shape :obj:`(batch_size, 1)`
            instead of all `input_ids` of shape :obj:`(batch_size, sequence_length)`.
        attention_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.

            `What are attention masks? <../glossary.html#attention-mask>`__
        token_type_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Segment token indices to indicate first and second portions of the inputs.
            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
            corresponds to a `sentence B` token
            If `past` is used, optionally only the last `token_type_ids` have to be input (see `past`).

            `What are token type IDs? <../glossary.html#token-type-ids>`_
        position_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Selected in the range ``[0, config.max_position_embeddings - 1]``.

            `What are position IDs? <../glossary.html#position-ids>`_
        head_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
            Mask to nullify selected heads of the self-attention modules.
            Mask values selected in ``[0, 1]``:
            :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
        input_embeds (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
            than the model's internal embedding lookup matrix.
            If `past` is used, optionally only the last `input_embeds` have to be input (see `past`).
        use_cache (:obj:`bool`):
            If `use_cache` is True, `past` key value states are returned and
            can be used to speed up decoding (see `past`). Defaults to `True`.
        training (:obj:`boolean`, `optional`, defaults to :obj:`False`):
            Whether to activate dropout modules (if set to :obj:`True`) during training or to de-activate them
            (if set to :obj:`False`) for evaluation.
z^The bare CTRL Model transformer outputting raw hidden-states without any specific head on top.c                       s,   e Zd Z fddZeedd Z  ZS )TFCTRLModelc                    s&   t  j|f|| t|dd| _d S )Nr   r/   )r4   r5   rp   r   r=   rw   rP   r>   r?   r   r   r5     s    zTFCTRLModel.__init__c                 K   s   | j |f|}|S )a(  
    Return:
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.CTRLConfig`) and inputs:
        last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the last layer of the model.
        past (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`):
            Contains pre-computed hidden-states (key and values in the attention blocks).
            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
            should not be passed as input ids as they have already been computed.
        hidden_states (:obj:`tuple(tf.Tensor)` `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`tf.Tensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        import tensorflow as tf
        from transformers import CTRLTokenizer, TFCTRLModel

        tokenizer = CTRLTokenizer.from_pretrained('ctrl')
        model = TFCTRLModel.from_pretrained('ctrl')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
        outputs = model(input_ids)
        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple

        )r   )r=   rP   r>   rU   r   r   r   rV     s    #zTFCTRLModel.call)rW   rX   rY   r5   r   CTRL_INPUTS_DOCSTRINGrV   rZ   r   r   r?   r   r     s   r   c                       s0   e Zd Z fddZ fddZdd Z  ZS )TFCTRLLMHeadc                    s    t  jf | |j| _|| _d S rK   )r4   r5   r|   input_embeddings)r=   rw   r   r>   r?   r   r   r5     s    zTFCTRLLMHead.__init__c                    s(   | j | jfdddd| _t | d S )NzerosTbias)shapeZinitializerZ	trainabler0   )Z
add_weightr|   r   r4   build)r=   r   r?   r   r   r   
  s    zTFCTRLLMHead.buildc                 C   s   | j |dd}|| j }|S )NZlinearr   )r   r   )r=   r   r   r   r   rV     s    
zTFCTRLLMHead.call)rW   rX   rY   r5   r   rV   rZ   r   r   r?   r   r     s   r   z~The CTRL Model transformer with a language modeling head on top
    (linear layer with weights tied to the input embeddings). c                       s<   e Zd Z fddZdd Zdd Zeedd Z  Z	S )	TFCTRLLMHeadModelc                    s:   t  j|f|| t|dd| _t|| jjdd| _d S )Nr   r/   lm_head)r4   r5   rp   r   r   rq   r   r   r?   r   r   r5     s    zTFCTRLLMHeadModel.__init__c                 C   s   | j jS rK   )r   r   r   r   r   r   get_output_embeddings   s    z'TFCTRLLMHeadModel.get_output_embeddingsc                 K   s,   |rt |d d df d}|||d dS )Nr   rS   )rP   r   rS   )r   Zexpand_dims)r=   rP   r   r>   r   r   r   prepare_inputs_for_generation#  s    z/TFCTRLLMHeadModel.prepare_inputs_for_generationc                 K   s6   | j |f|}|d }| |}|f|dd  }|S )a  
    Return:
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.CTRLConfig`) and inputs:
        prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        past (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`):
            Contains pre-computed hidden-states (key and values in the attention blocks).
            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
            should not be passed as input ids as they have already been computed.
        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`tf.Tensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        import tensorflow as tf
        from transformers import CTRLTokenizer, TFCTRLLMHeadModel

        tokenizer = CTRLTokenizer.from_pretrained('ctrl')
        model = TFCTRLLMHeadModel.from_pretrained('ctrl')

        input_ids = tf.constant([tokenizer.encode("Links Hello, my dog is cute", add_special_tokens=True)])
        outputs = model(input_ids)
        loss, logits = outputs[:2]

        r   r   N)r   r   )r=   rP   r>   Ztransformer_outputsr   Z	lm_logitsrU   r   r   r   rV   *  s
    $
zTFCTRLLMHeadModel.call)
rW   rX   rY   r5   r   r   r   r   rV   rZ   r   r   r?   r   r     s
   r   )NN)r[   )&r   loggingrJ   r   Z
tensorflowr   Zconfiguration_ctrlr   Z
file_utilsr   r   Zmodeling_tf_utilsr   r   r   r	   Ztokenization_utilsr
   	getLoggerrW   loggerr   r   r   r,   r:   r;   ZLayerr-   r`   ra   rp   r   ZCTRL_START_DOCSTRINGr   r   r   r   r   r   r   r   <module>   s@   

>
$ B
5,