""" PyTorch T5 model. """


import copy
import logging
import math
import os

import torch
import torch.nn.functional as F
from torch import nn
from torch.nn import CrossEntropyLoss

from .configuration_t5 import T5Config
from .file_utils import DUMMY_INPUTS, DUMMY_MASK, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_utils import PreTrainedModel, prune_linear_layer


logger = logging.getLogger(__name__)

T5_PRETRAINED_MODEL_ARCHIVE_MAP = {
    "t5-small": "https://cdn.huggingface.co/t5-small-pytorch_model.bin",
    "t5-base": "https://cdn.huggingface.co/t5-base-pytorch_model.bin",
    "t5-large": "https://cdn.huggingface.co/t5-large-pytorch_model.bin",
    "t5-3b": "https://cdn.huggingface.co/t5-3b-pytorch_model.bin",
    "t5-11b": "https://cdn.huggingface.co/t5-11b-pytorch_model.bin",
}


def load_tf_weights_in_t5(model, config, tf_checkpoint_path):
    """ Load tf checkpoints in a pytorch model.
    """
    try:
        import re
        import numpy as np
        import tensorflow as tf
    except ImportError:
        logger.error(
            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
            "https://www.tensorflow.org/install/ for installation instructions."
        )
        raise
    tf_path = os.path.abspath(tf_checkpoint_path)
    logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
    # Load weights from the TF checkpoint
    init_vars = tf.train.list_variables(tf_path)
    names = []
    tf_weights = {}
    for name, shape in init_vars:
        logger.info("Loading TF weight {} with shape {}".format(name, shape))
        array = tf.train.load_variable(tf_path, name)
        names.append(name)
        tf_weights[name] = array

    for txt_name in names:
        name = txt_name.split("/")
        # adam_v and adam_m are optimizer slots used by AdamWeightDecayOptimizer;
        # they are not needed to use the pretrained model
        if any(
            n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
            for n in name
        ):
            logger.info("Skipping {}".format("/".join(name)))
            tf_weights.pop(txt_name, None)
            continue
        if "_slot_" in name[-1]:
            logger.info("Skipping {}".format("/".join(name)))
            tf_weights.pop(txt_name, None)
            continue
        pointer = model
        array = tf_weights[txt_name]
        for m_name in name:
            if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
                scope_names = re.split(r"_(\d+)", m_name)
            else:
                scope_names = [m_name]
            if scope_names[0] in ["kernel", "scale", "embedding"]:
                pointer = getattr(pointer, "weight")
            else:
                try:
                    pointer = getattr(pointer, scope_names[0])
                except AttributeError:
                    logger.info("Skipping {}".format("/".join(name)))
                    continue
            if len(scope_names) >= 2:
                num = int(scope_names[1])
                pointer = pointer[num]
        if scope_names[0] not in ["kernel", "scale", "embedding"]:
            pointer = getattr(pointer, "weight")
        if scope_names[0] != "embedding":
            logger.info("Transposing numpy weight of shape {} for {}".format(array.shape, name))
            array = np.transpose(array)
        try:
            assert pointer.shape == array.shape
        except AssertionError as e:
            e.args += (pointer.shape, array.shape)
            raise
        logger.info("Initialize PyTorch weight {}".format(name))
        pointer.data = torch.from_numpy(array.astype(np.float32))
        tf_weights.pop(txt_name, None)

    logger.info("Weights not copied to PyTorch model: {}".format(", ".join(tf_weights.keys())))
    return model
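
# Illustrative usage sketch (not part of the library API): converting an original
# TensorFlow T5 checkpoint into this PyTorch implementation with the function above.
# The checkpoint path is a placeholder/assumption.
#
#     config = T5Config.from_pretrained("t5-small")
#     model = T5ForConditionalGeneration(config)
#     load_tf_weights_in_t5(model, config, "/path/to/tf_checkpoint")
#
# Ready-made PyTorch weights listed in T5_PRETRAINED_MODEL_ARCHIVE_MAP can instead be
# loaded directly with `T5ForConditionalGeneration.from_pretrained("t5-small")`.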


class T5LayerNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """ Construct a layernorm module in the T5 style
            No bias and no subtraction of mean.
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, x):
        # T5-style layer norm only rescales; it does not subtract the mean or add a bias
        variance = x.pow(2).mean(-1, keepdim=True)
        x = x / torch.sqrt(variance + self.variance_epsilon)
        return self.weight * x


class T5DenseReluDense(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.wi = nn.Linear(config.d_model, config.d_ff, bias=False)
        self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(self, hidden_states):
        h = self.wi(hidden_states)
        h = F.relu(h)
        h = self.dropout(h)
        h = self.wo(h)
        return h


class T5LayerFF(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.DenseReluDense = T5DenseReluDense(config)
        self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(self, hidden_states):
        norm_x = self.layer_norm(hidden_states)
        y = self.DenseReluDense(norm_x)
        layer_output = hidden_states + self.dropout(y)
        return layer_output


class T5Attention(nn.Module):
    def __init__(self, config: T5Config, has_relative_attention_bias=False):
        super().__init__()
        self.is_decoder = config.is_decoder
        self.has_relative_attention_bias = has_relative_attention_bias
        self.output_attentions = config.output_attentions
        self.relative_attention_num_buckets = config.relative_attention_num_buckets
        self.d_model = config.d_model
        self.d_kv = config.d_kv
        self.n_heads = config.num_heads
        self.dropout = config.dropout_rate
        self.inner_dim = self.n_heads * self.d_kv

        # Mesh TensorFlow initialization to avoid scaling before softmax
        self.q = nn.Linear(self.d_model, self.inner_dim, bias=False)
        self.k = nn.Linear(self.d_model, self.inner_dim, bias=False)
        self.v = nn.Linear(self.d_model, self.inner_dim, bias=False)
        self.o = nn.Linear(self.inner_dim, self.d_model, bias=False)

        if self.has_relative_attention_bias:
            self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        mask = torch.ones(self.n_heads, self.d_kv)
        heads = set(heads) - self.pruned_heads
        for head in heads:
            # Account for heads that were already pruned before this head's index
            head = head - sum(1 if h < head else 0 for h in self.pruned_heads)
            mask[head] = 0
        mask = mask.view(-1).contiguous().eq(1)
        index = torch.arange(len(mask))[mask].long()
        # Prune linear layers
        self.q = prune_linear_layer(self.q, index)
        self.k = prune_linear_layer(self.k, index)
        self.v = prune_linear_layer(self.v, index)
        self.o = prune_linear_layer(self.o, index, dim=1)
        # Update hyper params
        self.n_heads = self.n_heads - len(heads)
        self.inner_dim = self.d_kv * self.n_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    @staticmethod
    def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
        """
        Adapted from Mesh Tensorflow:
        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

        Translate relative position to a bucket number for relative attention.
        The relative position is defined as memory_position - query_position, i.e.
        the distance in tokens from the attending position to the attended-to
        position.  If bidirectional=False, then positive relative positions are
        invalid.
        We use smaller buckets for small absolute relative_position and larger buckets
        for larger absolute relative_positions.  All relative positions >=max_distance
        map to the same bucket.  All relative positions <=-max_distance map to the
        same bucket.  This should allow for more graceful generalization to longer
        sequences than the model has been trained on.
        Args:
            relative_position: an int32 Tensor
            bidirectional: a boolean - whether the attention is bidirectional
            num_buckets: an integer
            max_distance: an integer
        Returns:
            a Tensor with the same shape as relative_position, containing int32
            values in the range [0, num_buckets)
        """
        ret = 0
        n = -relative_position
        if bidirectional:
            num_buckets //= 2
            ret += (n < 0).to(torch.long) * num_buckets
            n = torch.abs(n)
        else:
            n = torch.max(n, torch.zeros_like(n))
        # now n is in the range [0, inf)

        # half of the buckets are for exact increments in positions
        max_exact = num_buckets // 2
        is_small = n < max_exact

        # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance
        val_if_large = max_exact + (
            torch.log(n.float() / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact)
        ).to(torch.long)
        val_if_large = torch.min(val_if_large, torch.full_like(val_if_large, num_buckets - 1))

        ret += torch.where(is_small, n, val_if_large)
        return ret

    def compute_bias(self, qlen, klen):
        """ Compute binned relative position bias """
        context_position = torch.arange(qlen, dtype=torch.long)[:, None]
        memory_position = torch.arange(klen, dtype=torch.long)[None, :]
        relative_position = memory_position - context_position  # shape (qlen, klen)
        rp_bucket = self._relative_position_bucket(
            relative_position, bidirectional=not self.is_decoder, num_buckets=self.relative_attention_num_buckets
        )
        rp_bucket = rp_bucket.to(self.relative_attention_bias.weight.device)
        values = self.relative_attention_bias(rp_bucket)  # shape (qlen, klen, num_heads)
        values = values.permute([2, 0, 1]).unsqueeze(0)  # shape (1, num_heads, qlen, klen)
        return values

    def forward(
        self,
        input,
        mask=None,
        kv=None,
        position_bias=None,
        past_key_value_state=None,
        head_mask=None,
        query_length=None,
        use_cache=False,
    ):
        """
        Self-attention (if kv is None) or attention over source sentence (provided by kv).
        """
        # Input is (bs, qlen, dim)
        # Mask is (bs, klen) (non-causal) or (bs, klen, klen)
        # past_key_value_state[0] is (bs, n_heads, q_len - 1, dim_per_head)
        bs, qlen, dim = input.size()

        if past_key_value_state is not None:
            assert self.is_decoder, "Encoder cannot cache past key value states"
            assert (
                len(past_key_value_state) == 2
            ), "past_key_value_state should have 2 past states: keys and values. Got {} past states".format(
                len(past_key_value_state)
            )
            real_qlen = qlen + past_key_value_state[0].shape[2] if query_length is None else query_length
        else:
            real_qlen = qlen

        if kv is None:
            klen = real_qlen
        else:
            klen = kv.size(1)

        def shape(x):
            """  projection """
            return x.view(bs, -1, self.n_heads, self.d_kv).transpose(1, 2)

        def unshape(x):
            """  compute context """
            return x.transpose(1, 2).contiguous().view(bs, -1, self.inner_dim)

        q = shape(self.q(input))  # (bs, n_heads, qlen, dim_per_head)

        if kv is None:
            k = shape(self.k(input))  # (bs, n_heads, qlen, dim_per_head)
            v = shape(self.v(input))  # (bs, n_heads, qlen, dim_per_head)
        elif past_key_value_state is None:
            k = v = kv
            k = shape(self.k(k))  # (bs, n_heads, qlen, dim_per_head)
            v = shape(self.v(v))  # (bs, n_heads, qlen, dim_per_head)

        if past_key_value_state is not None:
            if kv is None:
                k_, v_ = past_key_value_state
                k = torch.cat([k_, k], dim=2)  # (bs, n_heads, klen, dim_per_head)
                v = torch.cat([v_, v], dim=2)  # (bs, n_heads, klen, dim_per_head)
            else:
                k, v = past_key_value_state

        if self.is_decoder and use_cache is True:
            present_key_value_state = ((k, v),)
        else:
            present_key_value_state = (None,)

        scores = torch.einsum("bnqd,bnkd->bnqk", q, k)  # (bs, n_heads, qlen, klen)

        if position_bias is None:
            if not self.has_relative_attention_bias:
                raise ValueError("No position_bias provided and no weights to compute position_bias")
            position_bias = self.compute_bias(real_qlen, klen)

            # if key and values are already calculated
            # we want only the last query position bias
            if past_key_value_state is not None:
                position_bias = position_bias[:, :, -1:, :]

            if mask is not None:
                position_bias = position_bias + mask  # (bs, n_heads, qlen, klen)

        scores += position_bias
        weights = F.softmax(scores.float(), dim=-1).type_as(scores)  # (bs, n_heads, qlen, klen)
        weights = F.dropout(weights, p=self.dropout, training=self.training)  # (bs, n_heads, qlen, klen)

        # Mask heads if we want to
        if head_mask is not None:
            weights = weights * head_mask

        context = torch.matmul(weights, v)  # (bs, n_heads, qlen, dim_per_head)
        context = unshape(context)  # (bs, qlen, dim)

        context = self.o(context)

        outputs = (context,) + present_key_value_state

        if self.output_attentions:
            outputs = outputs + (weights,)
        if self.has_relative_attention_bias:
            outputs = outputs + (position_bias,)
        return outputs


class T5LayerSelfAttention(nn.Module):
    def __init__(self, config, has_relative_attention_bias=False):
        super().__init__()
        self.SelfAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias)
        self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        position_bias=None,
        head_mask=None,
        past_key_value_state=None,
        use_cache=False,
    ):
        norm_x = self.layer_norm(hidden_states)
        attention_output = self.SelfAttention(
            norm_x,
            mask=attention_mask,
            position_bias=position_bias,
            head_mask=head_mask,
            past_key_value_state=past_key_value_state,
            use_cache=use_cache,
        )
        y = attention_output[0]
        layer_output = hidden_states + self.dropout(y)
        outputs = (layer_output,) + attention_output[1:]  # add attentions if we output them
        return outputs


class T5LayerCrossAttention(nn.Module):
    def __init__(self, config, has_relative_attention_bias=False):
        super().__init__()
        self.EncDecAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias)
        self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(
        self,
        hidden_states,
        kv,
        attention_mask=None,
        position_bias=None,
        head_mask=None,
        past_key_value_state=None,
        use_cache=False,
        query_length=None,
    ):
        norm_x = self.layer_norm(hidden_states)
        attention_output = self.EncDecAttention(
            norm_x,
            mask=attention_mask,
            kv=kv,
            position_bias=position_bias,
            head_mask=head_mask,
            past_key_value_state=past_key_value_state,
            use_cache=use_cache,
            query_length=query_length,
        )
        y = attention_output[0]
        layer_output = hidden_states + self.dropout(y)
        outputs = (layer_output,) + attention_output[1:]  # add attentions if we output them
        return outputs


class T5Block(nn.Module):
    def __init__(self, config, has_relative_attention_bias=False):
        super().__init__()
        self.is_decoder = config.is_decoder
        self.layer = nn.ModuleList()
        self.layer.append(T5LayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias))
        if self.is_decoder:
            self.layer.append(T5LayerCrossAttention(config, has_relative_attention_bias=has_relative_attention_bias))

        self.layer.append(T5LayerFF(config))

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        position_bias=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        encoder_decoder_position_bias=None,
        head_mask=None,
        past_key_value_state=None,
        use_cache=False,
    ):

        if past_key_value_state is not None:
            assert self.is_decoder, "Only decoder can use `past_key_value_states`"
            expected_num_past_key_value_states = 2 if encoder_hidden_states is None else 4
            error_message = "There should be {} past states. 2 (past / key) for self attention.{} Got {} past key / value states".format(
                expected_num_past_key_value_states,
                "2 (past / key) for cross attention" if expected_num_past_key_value_states == 4 else "",
                len(past_key_value_state),
            )
            assert len(past_key_value_state) == expected_num_past_key_value_states, error_message

            self_attn_past_key_value_state = past_key_value_state[:2]
            cross_attn_past_key_value_state = past_key_value_state[2:]
        else:
            self_attn_past_key_value_state, cross_attn_past_key_value_state = None, None

        self_attention_outputs = self.layer[0](
            hidden_states,
            attention_mask=attention_mask,
            position_bias=position_bias,
            head_mask=head_mask,
            past_key_value_state=self_attn_past_key_value_state,
            use_cache=use_cache,
        )
        hidden_states, present_key_value_state = self_attention_outputs[:2]
        attention_outputs = self_attention_outputs[2:]  # Keep self-attention outputs and relative position weights

        if self.is_decoder and encoder_hidden_states is not None:
            # the actual query length is unknown for cross attention
            # if using past key value states. Need to inject it here
            if present_key_value_state is not None:
                query_length = present_key_value_state[0].shape[2]
            else:
                query_length = None

            cross_attention_outputs = self.layer[1](
                hidden_states,
                kv=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
                position_bias=encoder_decoder_position_bias,
                head_mask=head_mask,
                past_key_value_state=cross_attn_past_key_value_state,
                query_length=query_length,
                use_cache=use_cache,
            )
            hidden_states = cross_attention_outputs[0]
            # Combine self attn and cross attn key value states
            if present_key_value_state is not None:
                present_key_value_state = present_key_value_state + cross_attention_outputs[1]

            # Keep cross-attention outputs and relative position weights
            attention_outputs = attention_outputs + cross_attention_outputs[2:]

        # Apply Feed Forward layer
        hidden_states = self.layer[-1](hidden_states)
        outputs = (hidden_states,)

        # Add attentions if we output them
        outputs = outputs + (present_key_value_state,) + attention_outputs
        return outputs  # hidden-states, present_key_value_states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias)


class T5PreTrainedModel(PreTrainedModel):
    """ An abstract class to handle weights initialization and
        a simple interface for downloading and loading pretrained models.
    """

    config_class = T5Config
    pretrained_model_archive_map = T5_PRETRAINED_MODEL_ARCHIVE_MAP
    load_tf_weights = load_tf_weights_in_t5
    base_model_prefix = "transformer"

    @property
    def dummy_inputs(self):
        input_ids = torch.tensor(DUMMY_INPUTS)
        input_mask = torch.tensor(DUMMY_MASK)
        dummy_inputs = {
            "decoder_input_ids": input_ids,
            "input_ids": input_ids,
            "decoder_attention_mask": input_mask,
        }
        return dummy_inputs

    def _init_weights(self, module):
        """ Initialize the weights """
        factor = self.config.initializer_factor  # Used for testing weights initialization
        if isinstance(module, T5LayerNorm):
            module.weight.data.fill_(factor * 1.0)
        elif isinstance(module, (T5Model, T5ForConditionalGeneration)):
            module.shared.weight.data.normal_(mean=0.0, std=factor * 1.0)
        elif isinstance(module, T5DenseReluDense):
            module.wi.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5))
            if hasattr(module.wi, "bias") and module.wi.bias is not None:
                module.wi.bias.data.zero_()
            module.wo.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_ff) ** -0.5))
            if hasattr(module.wo, "bias") and module.wo.bias is not None:
                module.wo.bias.data.zero_()
        elif isinstance(module, T5Attention):
            # Mesh TensorFlow attention initialization to avoid scaling before softmax
            d_model = self.config.d_model
            d_kv = self.config.d_kv
            n_heads = self.config.num_heads
            module.q.weight.data.normal_(mean=0.0, std=factor * ((d_model * d_kv) ** -0.5))
            module.k.weight.data.normal_(mean=0.0, std=factor * (d_model ** -0.5))
            module.v.weight.data.normal_(mean=0.0, std=factor * (d_model ** -0.5))
            module.o.weight.data.normal_(mean=0.0, std=factor * ((n_heads * d_kv) ** -0.5))
            if module.has_relative_attention_bias:
                module.relative_attention_bias.weight.data.normal_(mean=0.0, std=factor * ((d_model) ** -0.5))

    def _shift_right(self, input_ids):
        decoder_start_token_id = self.config.decoder_start_token_id
        pad_token_id = self.config.pad_token_id

        assert (
            decoder_start_token_id is not None
        ), "self.model.config.decoder_start_token_id has to be defined. In T5 it is usually set to the pad_token_id. See T5 docs for more information"

        # shift inputs to the right
        shifted_input_ids = input_ids.new_zeros(input_ids.shape)
        shifted_input_ids[..., 1:] = input_ids[..., :-1].clone()
        shifted_input_ids[..., 0] = decoder_start_token_id

        assert pad_token_id is not None, "self.model.config.pad_token_id has to be defined."
        # replace possible -100 values in lm_labels by `pad_token_id`
        shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)

        assert torch.all(shifted_input_ids >= 0).item(), "Verify that `lm_labels` has only positive values and -100"

        return shifted_input_ids


class T5Stack(T5PreTrainedModel):
    def __init__(self, config, embed_tokens=None):
        super().__init__(config)
        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states

        self.embed_tokens = embed_tokens
        self.is_decoder = config.is_decoder

        self.block = nn.ModuleList(
            [T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)]
        )
        self.final_layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

        self.init_weights()

    def get_input_embeddings(self):
        return self.embed_tokens

    def get_output_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, new_embeddings):
        self.embed_tokens = new_embeddings

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        inputs_embeds=None,
        head_mask=None,
        past_key_value_states=None,
        use_cache=False,
    ):

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = input_ids.size()
            input_ids = input_ids.view(-1, input_shape[-1])
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            if self.is_decoder:
                raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
            else:
                raise ValueError("You have to specify either input_ids or inputs_embeds")

        if inputs_embeds is None:
            assert self.embed_tokens is not None, "You have to initialize the model with valid token embeddings"
            inputs_embeds = self.embed_tokens(input_ids)

        batch_size, seq_length = input_shape

        if past_key_value_states is not None:
            assert seq_length == 1, "Input shape is {}, but should be {} when using past_key_value_states".format(
                input_shape, (batch_size, 1)
            )
            # required mask seq length can be calculated via length of past
            # key value states and seq_length = 1 for the last token
            mask_seq_length = past_key_value_states[0][0].shape[2] + seq_length
        else:
            mask_seq_length = seq_length

        if attention_mask is None:
            attention_mask = torch.ones(batch_size, mask_seq_length).to(inputs_embeds.device)
        if self.is_decoder and encoder_attention_mask is None and encoder_hidden_states is not None:
            encoder_seq_length = encoder_hidden_states.shape[1]
            encoder_attention_mask = torch.ones(batch_size, encoder_seq_length).to(inputs_embeds.device)

        # initialize past_key_value_states with `None` if past does not exist
        if past_key_value_states is None:
            past_key_value_states = [None] * len(self.block)

        # Make the provided attention mask broadcastable to all heads (and causal for the decoder)
        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, inputs_embeds.device)

        if self.is_decoder and encoder_attention_mask is not None:
            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_extended_attention_mask = None

        # Prepare head mask if needed
        head_mask = self.get_head_mask(head_mask, self.config.num_layers)

        present_key_value_states = ()
        all_hidden_states = ()
        all_attentions = ()
        position_bias = None
        encoder_decoder_position_bias = None

        hidden_states = self.dropout(inputs_embeds)

        for i, (layer_module, past_key_value_state) in enumerate(zip(self.block, past_key_value_states)):
            if self.output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_outputs = layer_module(
                hidden_states,
                attention_mask=extended_attention_mask,
                position_bias=position_bias,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_extended_attention_mask,
                encoder_decoder_position_bias=encoder_decoder_position_bias,
                head_mask=head_mask[i],
                past_key_value_state=past_key_value_state,
                use_cache=use_cache,
            )
            # layer_outputs is a tuple with:
            # hidden-states, key-value-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias)
            hidden_states, present_key_value_state = layer_outputs[:2]
            if i == 0:
                # We share the position biases between the layers - the first layer stores them
                position_bias = layer_outputs[3 if self.output_attentions else 2]
                if self.is_decoder and encoder_hidden_states is not None:
                    encoder_decoder_position_bias = layer_outputs[5 if self.output_attentions else 3]
            # append next layer key value states
            present_key_value_states = present_key_value_states + (present_key_value_state,)

            if self.output_attentions:
                all_attentions = all_attentions + (layer_outputs[2],)  # We keep only self-attention weights for now

        hidden_states = self.final_layer_norm(hidden_states)
        hidden_states = self.dropout(hidden_states)

        # Add last layer
        if self.output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        outputs = (hidden_states,)
        if use_cache is True:
            assert self.is_decoder, "`use_cache` can only be set to `True` if {} is used as a decoder".format(self)
            outputs = outputs + (present_key_value_states,)
        if self.output_hidden_states:
            outputs = outputs + (all_hidden_states,)
        if self.output_attentions:
            outputs = outputs + (all_attentions,)
        return outputs  # last-layer hidden state, (presents,), (all hidden states), (all attentions)
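
# Worked example (illustrative, not part of the library): how T5Attention._relative_position_bucket
# maps relative positions to bias buckets with the defaults (bidirectional=True, num_buckets=32,
# max_distance=128). Half of the buckets encode the direction, small offsets get exact buckets,
# and larger offsets share logarithmically sized buckets:
#
#     relative_position = memory_position - query_position
#     # a key 3 positions to the left of the query  -> relative_position = -3 -> bucket 3
#     # a key 3 positions to the right of the query -> relative_position = +3 -> bucket 16 + 3 = 19
#     # a key 50 positions to the left              -> a shared "large distance" bucket
#
# Each bucket indexes a learned per-head bias in `relative_attention_bias`, which is added to the
# attention scores before the softmax (see T5Attention.compute_bias and T5Attention.forward).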
zT5Stack.forward)N)NNNNNNNF)	rM   rN   rO   r?   r   r   r   rK   rP   r   r   rE   r   r   o  s           r   af      The T5 model was proposed in
    `Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer`_
    by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu.
    It's an encoder-decoder transformer pre-trained in a text-to-text denoising generative setting.

    This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and
    refer to the PyTorch documentation for all matters related to general usage and behavior.

    .. _`Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer`:
        https://arxiv.org/abs/1910.10683

    .. _`torch.nn.Module`:
        https://pytorch.org/docs/stable/nn.html#module

    Parameters:
        config (:class:`~transformers.T5Config`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the configuration.
            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
"""

T5_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.
            T5 is a model with relative position embeddings so you should be able to pad the inputs on both the right and the left.
            Indices can be obtained using :class:`transformers.T5Tokenizer`.
            See :func:`transformers.PreTrainedTokenizer.encode` and
            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
            To know more on how to prepare :obj:`input_ids` for pre-training take a look at
            `T5 Training <./t5.html#training>`_ .
        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
        encoder_outputs (:obj:`tuple(tuple(torch.FloatTensor)`, `optional`, defaults to :obj:`None`):
            Tuple consists of (`last_hidden_state`, `optional`: `hidden_states`, `optional`: `attentions`)
            `last_hidden_state` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`) is a sequence of hidden-states at the output of the last layer of the encoder.
            Used in the cross-attention of the decoder.
        decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`, defaults to :obj:`None`):
            Provide for sequence to sequence training. T5 uses the pad_token_id as the starting token for decoder_input_ids generation.
            If `decoder_past_key_value_states` is used, optionally only the last `decoder_input_ids` have to be input (see `decoder_past_key_value_states`).
            To know more on how to prepare :obj:`decoder_input_ids` for pre-training take a look at
            `T5 Training <./t5.html#training>`_ .
        decoder_attention_mask (:obj:`torch.BoolTensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`, defaults to :obj:`None`):
            Default behavior: generate a tensor that ignores pad tokens in decoder_input_ids. Causal mask will also be used by default.
        decoder_past_key_value_states (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
            Contains pre-computed key and value hidden-states of the attention blocks.
            Can be used to speed up decoding.
            If `decoder_past_key_value_states` are used, the user can optionally input only the last `decoder_input_ids`
            (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
            instead of all `decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
            If `use_cache` is True, `decoder_past_key_value_states` are returned and can be used to speed up decoding (see `decoder_past_key_value_states`).
        inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
            than the model's internal embedding lookup matrix.
        decoder_inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, target_sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
            Optionally, instead of passing :obj:`decoder_input_ids` you can choose to directly pass an embedded representation.
            If `decoder_past_key_value_states` is used, optionally only the last `decoder_inputs_embeds` have to be input (see `decoder_past_key_value_states`).
            This is useful if you want more control over how to convert `decoder_input_ids` indices into associated vectors
            than the model's internal embedding lookup matrix.
        head_mask: (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
            Mask to nullify selected heads of the self-attention modules.
            Mask values selected in ``[0, 1]``:
            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
"""


@add_start_docstrings(
    "The bare T5 Model transformer outputting raw hidden-states without any specific head on top.",
    T5_START_DOCSTRING,
)
class T5Model(T5PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.shared = nn.Embedding(config.vocab_size, config.d_model)

        encoder_config = copy.deepcopy(config)
        self.encoder = T5Stack(encoder_config, self.shared)

        decoder_config = copy.deepcopy(config)
        decoder_config.is_decoder = True
        self.decoder = T5Stack(decoder_config, self.shared)

        self.init_weights()

    def get_input_embeddings(self):
        return self.shared

    def set_input_embeddings(self, new_embeddings):
        self.shared = new_embeddings
        self.encoder.set_input_embeddings(new_embeddings)
        self.decoder.set_input_embeddings(new_embeddings)

    def get_encoder(self):
        return self.encoder

    def get_decoder(self):
        return self.decoder

    def _prune_heads(self, heads_to_prune):
        """ Prunes heads of the model.
            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
            See base class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            # encoder blocks hold their self-attention module at layer[0].SelfAttention
            self.encoder.block[layer].layer[0].SelfAttention.prune_heads(heads)

    @add_start_docstrings_to_callable(T5_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        encoder_outputs=None,
        decoder_input_ids=None,
        decoder_attention_mask=None,
        decoder_past_key_value_states=None,
        use_cache=True,
        inputs_embeds=None,
        decoder_inputs_embeds=None,
        head_mask=None,
    ):
        r"""
    Return:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.T5Config`) and inputs.
        last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
            If `decoder_past_key_value_states` is used only the last hidden-state of the sequences of shape :obj:`(batch_size, 1, hidden_size)` is output.
        decoder_past_key_value_states (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`, `optional`, returned when ``use_cache=True``):
            Contains pre-computed key and value hidden-states of the attention blocks.
            Can be used to speed up sequential decoding (see `decoder_past_key_value_states` input).
            Note that when using `decoder_past_key_value_states`, the model only outputs the last `hidden-state` of the sequence of shape :obj:`(batch_size, 1, hidden_size)`.
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

            from transformers import T5Tokenizer, T5Model

            tokenizer = T5Tokenizer.from_pretrained('t5-small')
            model = T5Model.from_pretrained('t5-small')
            input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt")  # Batch size 1
            outputs = model(input_ids=input_ids, decoder_input_ids=input_ids)
            last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple

        """

        # Encode if needed (training, first prediction pass)
        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                input_ids=input_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds, head_mask=head_mask
            )

        hidden_states = encoder_outputs[0]

        # If decoding with past key value states, only the last tokens
        # should be given as an input
        if decoder_past_key_value_states is not None:
            if decoder_input_ids is not None:
                decoder_input_ids = decoder_input_ids[:, -1:]
            if decoder_inputs_embeds is not None:
                decoder_inputs_embeds = decoder_inputs_embeds[:, -1:]

        # Decode
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            inputs_embeds=decoder_inputs_embeds,
            past_key_value_states=decoder_past_key_value_states,
            encoder_hidden_states=hidden_states,
            encoder_attention_mask=attention_mask,
            head_mask=head_mask,
            use_cache=use_cache,
        )

        if use_cache is True:
            past = ((encoder_outputs, decoder_outputs[1]),)
            decoder_outputs = decoder_outputs[:1] + past + decoder_outputs[2:]

        return decoder_outputs + encoder_outputs
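
# Illustrative sketch (not part of the library API): re-using `decoder_past_key_value_states`
# for incremental decoding with T5Model. `input_ids`, `decoder_input_ids` and `next_token_id`
# are hypothetical tensors; `next_token_id` has shape (batch_size, 1).
#
#     outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids, use_cache=True)
#     encoder_outputs, decoder_past = outputs[1]
#     # On the next step only the newly generated decoder token needs to be fed;
#     # the cached key/value states cover all previous decoder positions.
#     outputs = model(
#         encoder_outputs=encoder_outputs,
#         decoder_input_ids=next_token_id,
#         decoder_past_key_value_states=decoder_past,
#         use_cache=True,
#     )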


@add_start_docstrings("""T5 Model with a `language modeling` head on top. """, T5_START_DOCSTRING)
class T5ForConditionalGeneration(T5PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.model_dim = config.d_model

        self.shared = nn.Embedding(config.vocab_size, config.d_model)

        encoder_config = copy.deepcopy(config)
        self.encoder = T5Stack(encoder_config, self.shared)

        decoder_config = copy.deepcopy(config)
        decoder_config.is_decoder = True
        self.decoder = T5Stack(decoder_config, self.shared)

        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)

        self.init_weights()

    def get_input_embeddings(self):
        return self.shared

    def set_input_embeddings(self, new_embeddings):
        self.shared = new_embeddings
        self.encoder.set_input_embeddings(new_embeddings)
        self.decoder.set_input_embeddings(new_embeddings)

    def get_output_embeddings(self):
        return self.lm_head

    def get_encoder(self):
        return self.encoder

    def get_decoder(self):
        return self.decoder

    @add_start_docstrings_to_callable(T5_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        encoder_outputs=None,
        decoder_input_ids=None,
        decoder_attention_mask=None,
        decoder_past_key_value_states=None,
        use_cache=True,
        lm_labels=None,
        inputs_embeds=None,
        decoder_inputs_embeds=None,
        head_mask=None,
    ):
        r"""
        lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
                Labels for computing the sequence classification/regression loss.
                Indices should be in :obj:`[-100, 0, ..., config.vocab_size - 1]`.
                All labels set to ``-100`` are ignored (masked), the loss is only
                computed for labels in ``[0, ..., config.vocab_size]``

    Returns:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.T5Config`) and inputs.
        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`lm_labels` is provided):
            Classification loss (cross entropy).
        prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
            If `past_key_value_states` is used only the last prediction_scores of the sequences of shape :obj:`(batch_size, 1, config.vocab_size)` is output.
        decoder_past_key_value_states (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`, `optional`, returned when ``use_cache=True``):
            Contains pre-computed key and value hidden-states of the attention blocks.
            Can be used to speed up sequential decoding (see `decoder_past_key_value_states` input).
            Note that when using `decoder_past_key_value_states`, the model only outputs the last `prediction_score` of the sequence of shape :obj:`(batch_size, 1, config.vocab_size)`.
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
            Attention weights after the attention softmax, used to compute the weighted average in the self-attention.

    Examples::

        from transformers import T5Tokenizer, T5ForConditionalGeneration

        tokenizer = T5Tokenizer.from_pretrained('t5-small')
        model = T5ForConditionalGeneration.from_pretrained('t5-small')
        input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt")  # Batch size 1
        outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, lm_labels=input_ids)
        loss, prediction_scores = outputs[:2]

        tokenizer = T5Tokenizer.from_pretrained('t5-small')
        model = T5ForConditionalGeneration.from_pretrained('t5-small')
        input_ids = tokenizer.encode("summarize: Hello, my dog is cute", return_tensors="pt")  # Batch size 1
        outputs = model.generate(input_ids)
        """

        # Encode if needed (training, first prediction pass)
        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                input_ids=input_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds, head_mask=head_mask
            )

        hidden_states = encoder_outputs[0]

        if lm_labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None:
            # get decoder inputs from shifting lm labels to the right
            decoder_input_ids = self._shift_right(lm_labels)

        # If decoding with past key value states, only the last tokens
        # should be given as an input
        if decoder_past_key_value_states is not None:
            assert lm_labels is None, "Decoder should not use cached key value states when training."
            if decoder_input_ids is not None:
                decoder_input_ids = decoder_input_ids[:, -1:]
            if decoder_inputs_embeds is not None:
                decoder_inputs_embeds = decoder_inputs_embeds[:, -1:]

        # Decode
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            inputs_embeds=decoder_inputs_embeds,
            past_key_value_states=decoder_past_key_value_states,
            encoder_hidden_states=hidden_states,
            encoder_attention_mask=attention_mask,
            head_mask=head_mask,
            use_cache=use_cache,
        )

        # insert decoder past at the right place to speed up decoding
        if use_cache is True:
            past = ((encoder_outputs, decoder_outputs[1]),)
            decoder_outputs = decoder_outputs[:1] + past + decoder_outputs[2:]

        sequence_output = decoder_outputs[0]
        # Rescale output before projecting on vocab (Mesh TensorFlow convention)
        sequence_output = sequence_output * (self.model_dim ** -0.5)
        lm_logits = self.lm_head(sequence_output)

        decoder_outputs = (lm_logits,) + decoder_outputs[1:]  # Add hidden states and attention if they are here

        if lm_labels is not None:
            loss_fct = CrossEntropyLoss(ignore_index=-100)
            loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), lm_labels.view(-1))
            decoder_outputs = (loss,) + decoder_outputs

        return decoder_outputs + encoder_outputs

    def prepare_inputs_for_generation(self, input_ids, past, attention_mask, use_cache, **kwargs):
        assert past is not None, "past has to be defined for encoder_outputs"

        # first step: `past` only carries the encoder outputs
        if len(past) < 2:
            encoder_outputs, decoder_past_key_value_states = past, None
        else:
            encoder_outputs, decoder_past_key_value_states = past[0], past[1]

        return {
            "decoder_input_ids": input_ids,
            "decoder_past_key_value_states": decoder_past_key_value_states,
            "encoder_outputs": encoder_outputs,
            "attention_mask": attention_mask,
            "use_cache": use_cache,
        }

    def _reorder_cache(self, past, beam_idx):
        # if decoder past is not included in output
        # speedy decoding is disabled and no need to reorder
        if len(past) < 2:
            logger.warning("You might want to consider setting `use_cache=True` to speed up decoding")
            return past

        decoder_past = past[1]
        past = (past[0],)
        reordered_decoder_past = ()
        for layer_past_states in decoder_past:
            # the batch dimension of the cached states is at position 0
            reordered_layer_past_states = ()
            for layer_past_state in layer_past_states:
                # select the correct batch indices for each of the key / value states
                reordered_layer_past_states = reordered_layer_past_states + (
                    layer_past_state.index_select(0, beam_idx),
                )

            assert reordered_layer_past_states[0].shape == layer_past_states[0].shape
            assert len(reordered_layer_past_states) == len(layer_past_states)

            reordered_decoder_past = reordered_decoder_past + (reordered_layer_past_states,)
        return past + (reordered_decoder_past,)
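
# Illustrative training-step sketch (not part of the library API): `batch_input_ids` and
# `batch_labels` are hypothetical LongTensors of shape (batch_size, src_len) and
# (batch_size, tgt_len). When only `lm_labels` is passed, the decoder inputs are built
# internally by shifting the labels to the right (see T5PreTrainedModel._shift_right).
#
#     model = T5ForConditionalGeneration.from_pretrained("t5-small")
#     outputs = model(input_ids=batch_input_ids, lm_labels=batch_labels)
#     loss = outputs[0]
#     loss.backward()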