# coding=utf-8
""" TF 2.0 T5 model. """


import copy
import itertools
import logging
import math

import tensorflow as tf

from .configuration_t5 import T5Config
from .file_utils import DUMMY_INPUTS, DUMMY_MASK, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, shape_list


logger = logging.getLogger(__name__)

TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP = {
    "t5-small": "https://cdn.huggingface.co/t5-small-tf_model.h5",
    "t5-base": "https://cdn.huggingface.co/t5-base-tf_model.h5",
    "t5-large": "https://cdn.huggingface.co/t5-large-tf_model.h5",
    "t5-3b": "https://cdn.huggingface.co/t5-3b-tf_model.h5",
    "t5-11b": "https://cdn.huggingface.co/t5-11b-tf_model.h5",
}


class TFT5LayerNorm(tf.keras.layers.Layer):
    def __init__(self, epsilon=1e-6, **kwargs):
        """ Construct a layernorm module in the T5 style.
            No bias and no subtraction of mean.
        """
        super().__init__(**kwargs)
        self.variance_epsilon = epsilon

    def build(self, input_shape):
        """Build shared word embedding layer """
        self.weight = self.add_weight("weight", shape=(input_shape[-1],), initializer="ones")
        super().build(input_shape)

    def call(self, x):
        variance = tf.math.reduce_mean(tf.math.square(x), axis=-1, keepdims=True)
        x = x * tf.math.rsqrt(variance + self.variance_epsilon)
        return self.weight * x
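
# Usage sketch for TFT5LayerNorm (illustrative only; the tensor shapes below are
# hypothetical). T5's layer norm is an RMS norm: it rescales activations by
# x * rsqrt(mean(x**2) + eps) * weight, with no mean subtraction and no bias term.
#
#   layer_norm = TFT5LayerNorm(epsilon=1e-6)
#   x = tf.random.normal((2, 8, 512))  # (batch_size, seq_length, d_model)
#   y = layer_norm(x)                  # same shape as x, unit RMS per position
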
|j| _tjjj| _d S )NFwiZuse_biasnamewo)r   r   r   keraslayersDenseZd_ffr&   d_modelr)   Dropoutdropout_ratedropoutZactivationsZreluactr   configr   r   r   r   r   D   s
    zTFT5DenseReluDense.__init__Fc                 C   s0   |  |}| |}| j||d}| |}|S Ntraining)r&   r1   r0   r)   )r   hidden_statesr6   hr   r   r   r    K   s
    


zTFT5DenseReluDense.call)Fr!   r"   r#   r   r    r$   r   r   r   r   r%   C   s   r%   c                       s&   e Zd Z fddZdddZ  ZS )TFT5LayerFFc                    sB   t  jf | t|dd| _t|jdd| _tjj	
|j| _d S )NDenseReluDenser(   
layer_normr   r(   )r   r   r%   r;   r   layer_norm_epsilonr=   r   r*   r+   r.   r/   r0   r2   r   r   r   r   T   s    zTFT5LayerFF.__init__Fc                 C   s.   |  |}| j||d}|| j||d }|S r4   )r=   r;   r0   )r   r7   r6   norm_xylayer_outputr   r   r   r    Z   s    
zTFT5LayerFF.call)Fr9   r   r   r   r   r:   S   s   r:   c                	       sN   e Zd Ze Zd fdd	Zdd Zedd	d
class TFT5Attention(tf.keras.layers.Layer):
    NEW_ID = itertools.count()

    def __init__(self, config, has_relative_attention_bias=False, **kwargs):
        super().__init__(**kwargs)
        self.layer_id = next(TFT5Attention.NEW_ID)
        self.is_decoder = config.is_decoder
        self.has_relative_attention_bias = has_relative_attention_bias
        self.output_attentions = config.output_attentions

        self.relative_attention_num_buckets = config.relative_attention_num_buckets
        self.d_model = config.d_model
        self.d_kv = config.d_kv
        self.n_heads = config.num_heads
        self.inner_dim = self.n_heads * self.d_kv

        self.q = tf.keras.layers.Dense(self.inner_dim, use_bias=False, name="q")
        self.k = tf.keras.layers.Dense(self.inner_dim, use_bias=False, name="k")
        self.v = tf.keras.layers.Dense(self.inner_dim, use_bias=False, name="v")
        self.o = tf.keras.layers.Dense(self.d_model, use_bias=False, name="o")
        self.dropout = tf.keras.layers.Dropout(config.dropout_rate)

        if self.has_relative_attention_bias:
            self.relative_attention_bias = tf.keras.layers.Embedding(
                self.relative_attention_num_buckets, self.n_heads, name="relative_attention_bias"
            )
        self.pruned_heads = set()

    def prune_heads(self, heads):
        raise NotImplementedError

    @staticmethod
    def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
        """
        Adapted from Mesh Tensorflow:
        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

        Translate relative position to a bucket number for relative attention.
        The relative position is defined as memory_position - query_position, i.e.
        the distance in tokens from the attending position to the attended-to
        position.  If bidirectional=False, then positive relative positions are
        invalid.
        We use smaller buckets for small absolute relative_position and larger buckets
        for larger absolute relative_positions.  All relative positions >=max_distance
        map to the same bucket.  All relative positions <=-max_distance map to the
        same bucket.  This should allow for more graceful generalization to longer
        sequences than the model has been trained on.
        Args:
            relative_position: an int32 Tensor
            bidirectional: a boolean - whether the attention is bidirectional
            num_buckets: an integer
            max_distance: an integer
        Returns:
            a Tensor with the same shape as relative_position, containing int32
            values in the range [0, num_buckets)
        """
        ret = 0
        n = -relative_position
        if bidirectional:
            num_buckets //= 2
            ret += tf.dtypes.cast(tf.math.less(n, 0), tf.int32) * num_buckets
            n = tf.math.abs(n)
        else:
            n = tf.math.maximum(n, 0)
        # now n is in the range [0, inf)
        max_exact = num_buckets // 2
        is_small = tf.math.less(n, max_exact)
        val_if_large = max_exact + tf.dtypes.cast(
            tf.math.log(tf.dtypes.cast(n, tf.float32) / max_exact)
            / math.log(max_distance / max_exact)
            * (num_buckets - max_exact),
            tf.int32,
        )
        val_if_large = tf.math.minimum(val_if_large, num_buckets - 1)
        ret += tf.where(is_small, n, val_if_large)
        return ret

    def compute_bias(self, qlen, klen):
        """ Compute binned relative position bias """
        context_position = tf.range(qlen)[:, None]
        memory_position = tf.range(klen)[None, :]
        relative_position = memory_position - context_position  # shape (qlen, klen)
        rp_bucket = self._relative_position_bucket(
            relative_position, bidirectional=not self.is_decoder, num_buckets=self.relative_attention_num_buckets
        )
        values = self.relative_attention_bias(rp_bucket)  # shape (qlen, klen, num_heads)
        values = tf.expand_dims(tf.transpose(values, [2, 0, 1]), axis=0)  # shape (1, num_heads, qlen, klen)
        return values
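
    # Worked example (illustrative, with the default num_buckets=32, max_distance=128):
    # for a bidirectional encoder, half of the 32 buckets cover negative offsets and
    # half cover positive ones. Within each half, offsets 0..7 each get their own
    # exact bucket (max_exact = 8), offsets 8..127 are spread over the remaining
    # 8 buckets on a log scale, and every offset >= 128 falls into the last bucket.
    # So nearby positions (e.g. offsets 1 and 2) map to distinct buckets while
    # distant ones (e.g. offsets 100 and 120) share a bucket, which is what lets the
    # learned bias generalize to sequence lengths not seen during training.
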
zTFT5Attention.compute_biasNc                    s  t |\ }}|dk	rhjdks(tdt|dksFtdt||dkrb|t |d d  n|}n|}|dkrz|}nt |d } fdd	} fd
d}||}|dkr||}||}n.|dkr| }}||}||}|dk	rN|dkrF|\}}tj	||gdd}tj	||gdd}n|\}}t
|	rxt|	drtt|	 }	nd}	jr|	dkr||ff}nd}td||}|dkr
jstd||}|dk	r|ddddddddf }|dk	r
|| }||7 }tjj|dd}j||
d}|dk	rB|| }t||}||}|}|f| }jr|||f }jr||f }|S )zd
        Self-attention (if kv is None) or attention over source sentence (provided by kv).
        NTz*Encoder cannot cache past key value statesrZ   zSpast_key_value_state should have 2 past states: keys and values. Got {} past statesr   r   c                    s"   t jt |  djjfddS )z  projection r   r   rZ   r      perm)r   ri   reshaperP   rO   r   bsr   r   r   r      s    z!TFT5Attention.call.<locals>.shapec                    s   t t j| dd djfS )z  compute context rn   rp   r   )r   rr   ri   rQ   rs   rt   r   r   unshape   s    z#TFT5Attention.call.<locals>.unshaperg   numpyrT   zbnqd,bnkd->bnqkzANo position_bias provided and no weights to compute position_biasr   r5   )r
   rK   AssertionErrorlenformatrD   rE   rF   r   concatZ	is_tensorhasattrboolrw   ZeinsumrL   
ValueErrorrm   nnZsoftmaxr0   matmulrG   rM   )r   inputmaskkvposition_biascachepast_key_value_state	head_maskquery_length	use_cacher6   rj   ZdimZ	real_qlenrk   r   rv   rD   rE   rF   Zk_Zv_present_key_value_stateZscoresweightscontextoutputsr   rt   r   r       sx    
"




 





zTFT5Attention.call)F)TrX   rY   )	NNNNNNNFF)r!   r"   r#   	itertoolscountrJ   r   rW   staticmethodrf   rm   r    r$   r   r   r   r   rC   a   s    -         rC   c                       s(   e Zd Zd fdd	ZdddZ  ZS )	TFT5LayerSelfAttentionFc                    sD   t  jf | t||dd| _t|jdd| _tjj	
|j| _d S )NSelfAttentionrL   r(   r=   r>   )r   r   rC   r   r   r?   r=   r   r*   r+   r.   r/   r0   rS   r   r   r   r   .  s      zTFT5LayerSelfAttention.__init__Nc              	   C   sR   |  |}| j|||||||d}	|	d }
|| j|
|d }|f|	dd   }|S )N)r   r   r   r   r   r6   r   r5   r   )r=   r   r0   )r   r7   attention_maskr   r   r   r   r6   r@   attention_outputrA   rB   r   r   r   r   r    6  s    

	zTFT5LayerSelfAttention.call)F)NNNNFFr9   r   r   r   r   r   -  s         r   c                       s(   e Zd Zd fdd	ZdddZ  ZS )	TFT5LayerCrossAttentionFc                    sD   t  jf | t||dd| _t|jdd| _tjj	
|j| _d S )NEncDecAttentionr   r=   r>   )r   r   rC   r   r   r?   r=   r   r*   r+   r.   r/   r0   rS   r   r   r   r   Q  s      z TFT5LayerCrossAttention.__init__Nc
                 C   sV   |  |}
| j|
||||||||	d	}|d }|| j||	d }|f|dd   }|S )N)r   r   r   r   r   r   r   r6   r   r5   r   )r=   r   r0   )r   r7   r   r   r   r   r   r   r   r6   r@   r   rA   rB   r   r   r   r   r    Y  s     
zTFT5LayerCrossAttention.call)F)NNNNNFFr9   r   r   r   r   r   P  s          r   c                	       s(   e Zd Zd fdd	ZdddZ  ZS )		TFT5BlockFc                    sr   t  jf | |j| _g | _| jt||dd | jrN| jt||dd | jt|dt	| jd d S )Nz	layer_._0r   z	layer_._1z
class TFT5Block(tf.keras.layers.Layer):
    def __init__(self, config, has_relative_attention_bias=False, **kwargs):
        super().__init__(**kwargs)
        self.is_decoder = config.is_decoder
        self.layer = []
        self.layer.append(
            TFT5LayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias, name="layer_._0")
        )
        if self.is_decoder:
            self.layer.append(
                TFT5LayerCrossAttention(
                    config, has_relative_attention_bias=has_relative_attention_bias, name="layer_._1"
                )
            )
        self.layer.append(TFT5LayerFF(config, name="layer_._{}".format(len(self.layer))))

    def call(
        self,
        hidden_states,
        attention_mask=None,
        position_bias=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        encoder_decoder_position_bias=None,
        head_mask=None,
        past_key_value_state=None,
        use_cache=False,
        training=False,
    ):
        if past_key_value_state is not None:
            assert self.is_decoder, "Only decoder can use `past_key_value_states`"
            expected_num_past_key_value_states = 2 if encoder_hidden_states is None else 4
            error_message = "There should be {} past states. 2 (past / key) for self attention.{} Got {} past key / value states".format(
                expected_num_past_key_value_states,
                "2 (past / key) for cross attention" if expected_num_past_key_value_states == 4 else "",
                len(past_key_value_state),
            )
            assert len(past_key_value_state) == expected_num_past_key_value_states, error_message

            self_attn_past_key_value_state = past_key_value_state[:2]
            cross_attn_past_key_value_state = past_key_value_state[2:]
        else:
            self_attn_past_key_value_state, cross_attn_past_key_value_state = None, None

        self_attention_outputs = self.layer[0](
            hidden_states,
            attention_mask=attention_mask,
            position_bias=position_bias,
            head_mask=head_mask,
            past_key_value_state=self_attn_past_key_value_state,
            use_cache=use_cache,
            training=training,
        )
        hidden_states, present_key_value_state = self_attention_outputs[:2]
        attention_outputs = self_attention_outputs[2:]  # Keep self-attention outputs and relative position weights

        if self.is_decoder and encoder_hidden_states is not None:
            # the actual query length is unknown for cross attention
            # if using past key value states; need to inject it here
            if present_key_value_state is not None:
                query_length = shape_list(present_key_value_state[0])[2]
            else:
                query_length = None

            cross_attention_outputs = self.layer[1](
                hidden_states,
                kv=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
                position_bias=encoder_decoder_position_bias,
                head_mask=head_mask,
                past_key_value_state=cross_attn_past_key_value_state,
                query_length=query_length,
                use_cache=use_cache,
                training=training,
            )
            hidden_states = cross_attention_outputs[0]
            # Combine self attn and cross attn key value states
            if present_key_value_state is not None:
                present_key_value_state = present_key_value_state + cross_attention_outputs[1]

            # Keep cross-attention outputs and relative position weights
            attention_outputs = attention_outputs + cross_attention_outputs[2:]

        # Apply Feed Forward layer
        hidden_states = self.layer[-1](hidden_states, training=training)
        outputs = (hidden_states,)

        # Add attentions if we output them
        outputs = outputs + (present_key_value_state,) + attention_outputs
        # hidden-states, present_key_value_states, (self-attention weights),
        # (self-attention position bias), (cross-attention weights), (cross-attention position bias)
        return outputs
ddZdddZddd	ZdS )_NoLayerEmbedTokensa  
     this class wraps a the TFSharedEmbeddingTokens layer into a python 'no-keras-layer'
     class to avoid problem with weight restoring. Also it makes sure that the layer is
     called from the correct scope to avoid problem with saving/storing the correct weights
    Nc                 C   s   || _ || _d S rT   )_layer_abs_scope_name)r   r   abs_scope_namer   r   r   r     s    z_NoLayerEmbedTokens.__init__	embeddingc                 C   sv   | j d kr| j||S tjjj| j dd@}t|j( | j||W  5 Q R  W  5 Q R  S Q R X W 5 Q R X d S NF)Zauxiliary_name_scope)	r   r   r    r   compatv1variable_scope
name_scopeoriginal_name_scoper   inputsmoder   r   r   r   r      s
    
z_NoLayerEmbedTokens.callc                 C   sr   | j d kr| ||S tjjj| j dd>}t|j& | ||W  5 Q R  W  5 Q R  S Q R X W 5 Q R X d S r   )r   r   r   r   r   r   r   r   r   r   r   r   __call__  s
    
z_NoLayerEmbedTokens.__call__)N)r   )r   )r!   r"   r#   __doc__r   r    r   r   r   r   r   r     s   

class TFT5MainLayer(tf.keras.layers.Layer):
    def __init__(self, config, embed_tokens=None, **kwargs):
        super().__init__(**kwargs)
        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states

        self.embed_tokens = embed_tokens
        self.is_decoder = config.is_decoder

        self.config = config
        self.num_hidden_layers = config.num_layers

        # only the first block learns the relative attention bias; the others reuse it
        self.block = [
            TFT5Block(config, has_relative_attention_bias=bool(i == 0), name="block_._{}".format(i))
            for i in range(config.num_layers)
        ]
        self.final_layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, name="final_layer_norm")
        self.dropout = tf.keras.layers.Dropout(config.dropout_rate)

    def get_input_embeddings(self):
        return self.embed_tokens

    def get_output_embeddings(self):
        return self.embed_tokens

    def set_embed_tokens(self, embed_tokens):
        self.embed_tokens = embed_tokens

    def _resize_token_embeddings(self, new_num_tokens):
        raise NotImplementedError

    def _prune_heads(self, heads_to_prune):
        raise NotImplementedError

    def call(
        self,
        input_ids,
        attention_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        inputs_embeds=None,
        head_mask=None,
        past_key_value_states=None,
        use_cache=False,
        training=False,
    ):
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = shape_list(input_ids)
            input_ids = tf.reshape(input_ids, (-1, input_shape[-1]))
        elif inputs_embeds is not None:
            input_shape = shape_list(inputs_embeds)[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if inputs_embeds is None:
            assert self.embed_tokens is not None, "You have to initialize the model with valid token embeddings"
            inputs_embeds = self.embed_tokens(input_ids)

        batch_size, seq_length = input_shape

        if past_key_value_states is not None:
            assert seq_length == 1, "Input shape is {}, but should be {} when using past_key_value_states".format(
                input_shape, (batch_size, 1)
            )
            # required mask seq length can be calculated via length of past
            # key value states and seq_length = 1 for the last token
            mask_seq_length = shape_list(past_key_value_states[0][0])[2] + seq_length
        else:
            mask_seq_length = seq_length

        if attention_mask is None:
            attention_mask = tf.fill((batch_size, mask_seq_length), 1)
        if self.is_decoder and encoder_attention_mask is None and encoder_hidden_states is not None:
            encoder_seq_length = shape_list(encoder_hidden_states)[1]
            encoder_attention_mask = tf.fill((batch_size, encoder_seq_length), 1)

        # initialize past_key_value_states with `None` if past does not exist
        if past_key_value_states is None:
            past_key_value_states = [None] * len(self.block)

        # We can provide a self-attention mask of dimensions
        # [batch_size, from_seq_length, to_seq_length] ourselves, in which case
        # we just need to make it broadcastable to all heads.
        attention_mask = tf.cast(attention_mask, dtype=tf.float32)
        num_dims_attention_mask = len(shape_list(attention_mask))
        if num_dims_attention_mask == 3:
            extended_attention_mask = attention_mask[:, None, :, :]
        elif num_dims_attention_mask == 2:
            # Provided a padding mask of dimensions [batch_size, mask_seq_length]
            # - if the model is a decoder, apply a causal mask in addition to the padding mask
            # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
            if self.is_decoder:
                seq_ids = tf.range(mask_seq_length)
                causal_mask = tf.less_equal(
                    tf.tile(seq_ids[None, None, :], (batch_size, mask_seq_length, 1)), seq_ids[None, :, None],
                )
                causal_mask = tf.cast(causal_mask, dtype=tf.float32)
                extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
                if past_key_value_states[0] is not None:
                    extended_attention_mask = extended_attention_mask[:, :, -1:, :]
            else:
                extended_attention_mask = attention_mask[:, None, None, :]

        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
        # masked positions, this creates a tensor that is 0.0 for positions we
        # want to attend and -1e9 for masked positions, added to the raw scores
        # before the softmax (effectively the same as removing them entirely).
        extended_attention_mask = (1.0 - extended_attention_mask) * -1e9

        if self.is_decoder and encoder_attention_mask is not None:
            # If a 2D or 3D attention mask is provided for the cross-attention,
            # make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
            encoder_attention_mask = tf.cast(encoder_attention_mask, dtype=tf.float32)
            num_dims_encoder_attention_mask = len(shape_list(encoder_attention_mask))
            if num_dims_encoder_attention_mask == 3:
                encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :]
            if num_dims_encoder_attention_mask == 2:
                encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :]

            encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e9
        else:
            encoder_extended_attention_mask = None

        if head_mask is not None:
            raise NotImplementedError
        else:
            head_mask = [None] * self.num_hidden_layers

        present_key_value_states = ()
        all_hidden_states = ()
        all_attentions = ()
        position_bias = None
        encoder_decoder_position_bias = None

        hidden_states = self.dropout(inputs_embeds, training=training)

        for i, (layer_module, past_key_value_state) in enumerate(zip(self.block, past_key_value_states)):
            if self.output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_outputs = layer_module(
                hidden_states,
                attention_mask=extended_attention_mask,
                position_bias=position_bias,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_extended_attention_mask,
                encoder_decoder_position_bias=encoder_decoder_position_bias,
                head_mask=head_mask[i],
                past_key_value_state=past_key_value_state,
                use_cache=use_cache,
                training=training,
            )
            # layer_outputs is a tuple with:
            # hidden-states, key-value-states, (self-attention weights),
            # (self-attention position bias), (cross-attention weights), (cross-attention position bias)
            hidden_states, present_key_value_state = layer_outputs[:2]
            if i == 0:
                # We share the position biases between the layers - the first layer stores them
                position_bias = layer_outputs[3 if self.output_attentions else 2]
                if self.is_decoder and encoder_hidden_states is not None:
                    encoder_decoder_position_bias = layer_outputs[5 if self.output_attentions else 3]
            # append next layer key value states
            present_key_value_states = present_key_value_states + (present_key_value_state,)

            if self.output_attentions:
                all_attentions = all_attentions + (layer_outputs[2],)  # We keep only self-attention weights for now

        hidden_states = self.final_layer_norm(hidden_states)
        hidden_states = self.dropout(hidden_states, training=training)

        # Add last layer
        if self.output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        outputs = (hidden_states,)
        if use_cache is True:
            assert self.is_decoder, "`use_cache` can only be set to `True` if {} is used as a decoder".format(self)
            outputs = outputs + (present_key_value_states,)
        if self.output_hidden_states:
            outputs = outputs + (all_hidden_states,)
        if self.output_attentions:
            outputs = outputs + (all_attentions,)
        return outputs  # last-layer hidden state, (presents,), (all hidden states), (all attentions)
dS )TFT5PreTrainedModelz An abstract class to handle weights initialization and
        a simple interface for downloading and loading pretrained models.
    Ztransformerc                 C   s$   t t}t t}|||d}|S )N)r   decoder_input_idsdecoder_attention_mask)r   Zconstantr   r   )r   r   Z
input_maskdummy_inputsr   r   r   r     s    

z TFT5PreTrainedModel.dummy_inputsN)r!   r"   r#   r   r   Zconfig_class"TF_T5_PRETRAINED_MODEL_ARCHIVE_MAPZpretrained_model_archive_mapZbase_model_prefixpropertyr   r   r   r   r   r     s   r   a      The T5 model was proposed in
    `Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer`_
    by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu.
    It's an encoder-decoder transformer pre-trained in a text-to-text denoising generative setting.

    This model is a tf.keras.Model `tf.keras.Model`_ sub-class. Use it as a regular TF 2.0 Keras Model and
    refer to the TF 2.0 documentation for all matters related to general usage and behavior.

    .. _`Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer`:
        https://arxiv.org/abs/1910.10683

    .. _`tf.keras.Model`:
        https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/Model

    Note on the model inputs:
        TF 2.0 models accept two formats as inputs:

            - having all inputs as keyword arguments (like PyTorch models), or
            - having all inputs as a list, tuple or dict in the first positional argument.

        This second option is useful when using the `tf.keras.Model.fit()` method, which currently requires having all the tensors in the first argument of the model call function: `model(inputs)`.

        If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument (see the sketch after this docstring):

        - a single Tensor with input_ids only and nothing else: `model(input_ids)`
        - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
            `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
        - a dictionary with one or several input Tensors associated to the input names given in the docstring:
            `model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`

    Parameters:
        config (:class:`~transformers.T5Config`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the configuration.
            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
"""
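
# A minimal sketch of the three call styles described above (illustrative only;
# `input_ids` / `attention_mask` are placeholder tensors, and note that this
# module's dict key for the encoder input ids is "inputs"):
#
#   outputs = model(input_ids)                          # single Tensor
#   outputs = model([input_ids, attention_mask])        # list, in docstring order
#   outputs = model({"inputs": input_ids,               # dict of named inputs
#                    "attention_mask": attention_mask})
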
T5_INPUTS_DOCSTRING = r"""
    Args:
        inputs are usually used as a `dict` (see T5 description above for more information) containing all the following.

        inputs (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.
            T5 is a model with relative position embeddings so you should be able to pad the inputs on
            the right or the left.
            Indices can be obtained using :class:`transformers.T5Tokenizer`.
            To know more on how to prepare :obj:`input_ids` for pre-training take a look at
            `T5 Training <./t5.html#training>`_ .
            See :func:`transformers.PreTrainedTokenizer.encode` and
            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
        decoder_input_ids (:obj:`tf.Tensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`, defaults to :obj:`None`):
            Provide for sequence to sequence training. T5 uses the pad_token_id as the starting token for decoder_input_ids generation.
            If `decoder_past_key_value_states` is used, optionally only the last `decoder_input_ids` have to be input (see `decoder_past_key_value_states`).
        attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
        encoder_outputs (:obj:`tuple(tuple(tf.FloatTensor)`, `optional`, defaults to :obj:`None`):
            Tuple consists of (`last_hidden_state`, `optional`: `hidden_states`, `optional`: `attentions`)
            `last_hidden_state` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`) is a sequence of hidden-states at the output of the last layer of the encoder.
            Used in the cross-attention of the decoder.
        decoder_attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`, defaults to :obj:`None`):
            Default behavior: generate a tensor that ignores pad tokens in decoder_input_ids. Causal mask will also be used by default.
        decoder_past_key_value_states (:obj:`tuple(tuple(tf.Tensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
            Contains pre-computed key and value hidden-states of the attention blocks.
            Can be used to speed up decoding.
            If `decoder_past_key_value_states` are used, the user can optionally input only the last `decoder_input_ids`
            (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
            If `use_cache` is True, `decoder_past_key_value_states` are returned and can be used to speed up decoding (see `decoder_past_key_value_states`).
        inputs_embeds (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
            than the model's internal embedding lookup matrix.
        decoder_inputs_embeds (:obj:`tf.Tensor` of shape :obj:`(batch_size, target_sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
            Optionally, instead of passing :obj:`decoder_input_ids` you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `decoder_input_ids` indices into associated vectors
            than the model's internal embedding lookup matrix.
            To know more on how to prepare :obj:`decoder_input_ids` for pre-training take a look at
            `T5 Training <./t5.html#training>`_ .
        head_mask (:obj:`tf.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
            Mask to nullify selected heads of the self-attention modules.
            Mask values selected in ``[0, 1]``:
            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
"""
@add_start_docstrings(
    "The bare T5 Model transformer outputting raw hidden-states without any specific head on top.",
    T5_START_DOCSTRING,
)
class TFT5Model(TFT5PreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.shared = TFSharedEmbeddings(config.vocab_size, config.d_model, name="shared")

        # retrieve correct absolute scope for embed token wrapper
        with tf.compat.v1.variable_scope("shared") as shared_abs_scope_name:
            pass

        embed_tokens = _NoLayerEmbedTokens(self.shared, abs_scope_name=shared_abs_scope_name)

        encoder_config = copy.deepcopy(config)
        self.encoder = TFT5MainLayer(encoder_config, embed_tokens, name="encoder")

        decoder_config = copy.deepcopy(config)
        decoder_config.is_decoder = True
        self.decoder = TFT5MainLayer(decoder_config, embed_tokens, name="decoder")

    def get_input_embeddings(self):
        return self.shared

    def get_output_embeddings(self):
        return self.shared

    def get_encoder(self):
        return self.encoder

    def get_decoder(self):
        return self.decoder

    @add_start_docstrings_to_callable(T5_INPUTS_DOCSTRING)
    def call(self, inputs, **kwargs):
        r"""
    Return:
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.T5Config`) and inputs.
        last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
            If `decoder_past_key_value_states` is used only the last hidden-state of the sequences of shape :obj:`(batch_size, 1, hidden_size)` is output.
        decoder_past_key_value_states (:obj:`tuple(tuple(tf.Tensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`, `optional`, returned when ``use_cache=True``):
            Contains pre-computed key and value hidden-states of the attention blocks.
            Can be used to speed up sequential decoding (see `decoder_past_key_value_states` input).
            Note that when using `decoder_past_key_value_states`, the model only outputs the last `hidden-state` of the sequence of shape :obj:`(batch_size, 1, hidden_size)`.
        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`tf.Tensor` (one for each layer) of shape
                :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        from transformers import T5Tokenizer, TFT5Model

        tokenizer = T5Tokenizer.from_pretrained('t5-small')
        model = TFT5Model.from_pretrained('t5-small')
        input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="tf")  # Batch size 1
        outputs = model(input_ids, decoder_input_ids=input_ids)
        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple

        """
        if isinstance(inputs, dict):
            kwargs.update(inputs)
        else:
            kwargs["inputs"] = inputs

        # retrieve arguments
        input_ids = kwargs.get("inputs", None)
        decoder_input_ids = kwargs.get("decoder_input_ids", None)
        attention_mask = kwargs.get("attention_mask", None)
        encoder_outputs = kwargs.get("encoder_outputs", None)
        decoder_attention_mask = kwargs.get("decoder_attention_mask", None)
        decoder_past_key_value_states = kwargs.get("decoder_past_key_value_states", None)
        use_cache = kwargs.get("use_cache", True)
        inputs_embeds = kwargs.get("inputs_embeds", None)
        decoder_inputs_embeds = kwargs.get("decoder_inputs_embeds", None)
        head_mask = kwargs.get("head_mask", None)

        # Encode if needed (training, first prediction pass)
        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                input_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds, head_mask=head_mask
            )

        hidden_states = encoder_outputs[0]

        # If decoding with past key value states, only the last tokens
        # should be given as an input
        if decoder_past_key_value_states is not None:
            if decoder_input_ids is not None:
                decoder_input_ids = decoder_input_ids[:, -1:]
            if decoder_inputs_embeds is not None:
                decoder_inputs_embeds = decoder_inputs_embeds[:, -1:]

        # Decode
        decoder_outputs = self.decoder(
            decoder_input_ids,
            attention_mask=decoder_attention_mask,
            inputs_embeds=decoder_inputs_embeds,
            past_key_value_states=decoder_past_key_value_states,
            encoder_hidden_states=hidden_states,
            encoder_attention_mask=attention_mask,
            head_mask=head_mask,
            use_cache=use_cache,
        )

        if use_cache is True:
            past = ((encoder_outputs, decoder_outputs[1]),)
            decoder_outputs = decoder_outputs[:1] + past + decoder_outputs[2:]

        return decoder_outputs + encoder_outputs
@add_start_docstrings("T5 Model with a `language modeling` head on top. ", T5_START_DOCSTRING)
class TFT5ForConditionalGeneration(TFT5PreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.model_dim = config.d_model
        self.shared = TFSharedEmbeddings(config.vocab_size, config.d_model, name="shared")

        # retrieve correct absolute scope for embed token wrapper
        with tf.compat.v1.variable_scope("shared") as shared_abs_scope_name:
            pass

        embed_tokens = _NoLayerEmbedTokens(self.shared, abs_scope_name=shared_abs_scope_name)

        encoder_config = copy.deepcopy(config)
        self.encoder = TFT5MainLayer(encoder_config, embed_tokens, name="encoder")

        decoder_config = copy.deepcopy(config)
        decoder_config.is_decoder = True
        self.decoder = TFT5MainLayer(decoder_config, embed_tokens, name="decoder")

    def get_input_embeddings(self):
        return self.shared

    def get_output_embeddings(self):
        return self.shared

    def get_encoder(self):
        return self.encoder

    def get_decoder(self):
        return self.decoder

    @add_start_docstrings_to_callable(T5_INPUTS_DOCSTRING)
    def call(self, inputs, **kwargs):
        r"""
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.T5Config`) and inputs.
        loss (:obj:`tf.Tensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`lm_label` is provided):
            Classification loss (cross entropy).
        prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        decoder_past_key_value_states (:obj:`tuple(tuple(tf.Tensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`, `optional`, returned when ``use_cache=True``):
            Contains pre-computed key and value hidden-states of the attention blocks.
            Can be used to speed up sequential decoding (see `decoder_past_key_value_states` input).
            Note that when using `decoder_past_key_value_states`, the model only outputs the last `prediction_score` of the sequence of shape :obj:`(batch_size, 1, config.vocab_size)`.
        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`tf.Tensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention.

    Examples::

        from transformers import T5Tokenizer, TFT5ForConditionalGeneration

        tokenizer = T5Tokenizer.from_pretrained('t5-small')
        model = TFT5ForConditionalGeneration.from_pretrained('t5-small')
        input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="tf")  # Batch size 1
        outputs = model(input_ids, decoder_input_ids=input_ids)
        prediction_scores = outputs[0]

        tokenizer = T5Tokenizer.from_pretrained('t5-small')
        model = TFT5ForConditionalGeneration.from_pretrained('t5-small')
        input_ids = tokenizer.encode("summarize: Hello, my dog is cute", return_tensors="tf")  # Batch size 1
        model.generate(input_ids)

        r   Nr   r   r   r   r   r   Tr   r   r   r   r   r   r   r   rZ   g      Zlinear)r   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   r   r   r   r   r   r7   r   r   Zsequence_outputr   Z	lm_logitsr   r   r   r      sX    )
   
z!TFT5ForConditionalGeneration.callc                 K   sL   |d k	st dt|dk r(|d  }}n|d |d  }}d |||||dS )Nz*past has to be defined for encoder_outputsrZ   r   r   )r   r   r   r   r   r   )rx   ry   )r   r   r   r   r   r   r   r   r   r   r   prepare_inputs_for_generation7  s    z:TFT5ForConditionalGeneration.prepare_inputs_for_generationc                 C   s   t |dk rtd |S |d }|d f}d}|D ]^}d}|D ]}|t||f }q@t|d t|d ksttt |t |kst||f }q4||f S )NrZ   zHYou might want to consider setting `use_cache=True` to speed up decodingr   r   r   )ry   loggerwarningr   Zgatherr
   rx   )r   r   Zbeam_idxZdecoder_pastZreordered_decoder_pastZlayer_past_statesZreordered_layer_past_statesZlayer_past_stater   r   r   _reorder_cacheI  s    

z+TFT5ForConditionalGeneration._reorder_cache)r!   r"   r#   r   r   r   r   r   r   r   r    r   r   r$   r   r   r   r   r     s   
cr   )(r   r   r   loggingr   Z
tensorflowr   Zconfiguration_t5r   Z
file_utilsr   r   r   r   Zmodeling_tf_utilsr   r	   r
   	getLoggerr!   r   r   r*   r+   ZLayerr   r%   r:   rC   r   r   r   objectr   r   r   ZT5_START_DOCSTRINGr   r   r   r   r   r   r   <module>   sH   
 M#'_" V$2v