"""PyTorch BART model, ported from the fairseq repo."""

import logging
import math
import random
from typing import Dict, List, Optional, Tuple

import numpy as np
import torch
import torch.nn.functional as F
from torch import Tensor, nn

from .activations import ACT2FN
from .configuration_bart import BartConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .modeling_utils import PreTrainedModel, create_position_ids_from_input_ids


logger = logging.getLogger(__name__)


BART_PRETRAINED_MODEL_ARCHIVE_MAP = {
    "bart-large": "https://cdn.huggingface.co/facebook/bart-large/pytorch_model.bin",
    "bart-large-mnli": "https://cdn.huggingface.co/facebook/bart-large-mnli/pytorch_model.bin",
    "bart-large-cnn": "https://cdn.huggingface.co/facebook/bart-large-cnn/pytorch_model.bin",
    "bart-large-xsum": "https://cdn.huggingface.co/facebook/bart-large-xsum/pytorch_model.bin",
    "mbart-large-en-ro": "https://cdn.huggingface.co/facebook/mbart-large-en-ro/pytorch_model.bin",
}

BART_START_DOCSTRING = r"""

    This model is a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`_ sub-class. Use it as a regular PyTorch Module and
    refer to the PyTorch documentation for all matters related to general usage and behavior.

    Parameters:
        config (:class:`~transformers.BartConfig`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the configuration.
            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.

"""

BART_GENERATION_EXAMPLE = r"""
    Examples::

        from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig
        # see ``examples/summarization/bart/evaluate_cnn.py`` for a longer example
        model = BartForConditionalGeneration.from_pretrained('bart-large-cnn')
        tokenizer = BartTokenizer.from_pretrained('bart-large-cnn')
        ARTICLE_TO_SUMMARIZE = "My friends are cool but they eat too many carbs."
        inputs = tokenizer.batch_encode_plus([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='pt')
        # Generate Summary
        summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=5, early_stopping=True)
        print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids])

"""

BART_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Use BartTokenizer.encode to produce them.
            Padding will be ignored by default should you provide it.
            Indices can be obtained using :class:`transformers.BartTokenizer.encode(text)`.
        attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Mask to avoid performing attention on padding token indices in input_ids.
            Mask values selected in ``[0, 1]``:
            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
        encoder_outputs (:obj:`tuple(tuple(torch.FloatTensor)`, `optional`, defaults to :obj:`None`):
            Tuple consists of (`last_hidden_state`, `optional`: `hidden_states`, `optional`: `attentions`)
            `last_hidden_state` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`) is a sequence of hidden-states at the output of the last layer of the encoder.
            Used in the cross-attention of the decoder.
        decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`, defaults to :obj:`None`):
            Provide for translation and summarization training. By default, the model will create this tensor by shifting the input_ids right, following the paper.
        decoder_attention_mask (:obj:`torch.BoolTensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`, defaults to :obj:`None`):
            Default behavior: generate a tensor that ignores pad tokens in decoder_input_ids. Causal mask will also be used by default.
            If you want to change padding behavior, you should read :func:`~transformers.modeling_bart._prepare_decoder_inputs` and modify.
            See diagram 1 in the paper for more info on the default strategy
"""


def invert_mask(attention_mask):
    assert attention_mask.dim() == 2
    return attention_mask.eq(0)


def _prepare_bart_decoder_inputs(
    config, input_ids, decoder_input_ids=None, decoder_padding_mask=None, causal_mask_dtype=torch.float32
):
    """Prepare masks that ignore padding tokens in the decoder and a causal mask for the decoder if
    none are provided. This mimics the default behavior in fairseq. To override it pass in masks.
    Note: this is not called during generation
    """
    pad_token_id = config.pad_token_id
    if decoder_input_ids is None:
        decoder_input_ids = shift_tokens_right(input_ids, pad_token_id)
    bsz, tgt_len = decoder_input_ids.size()
    if decoder_padding_mask is None:
        decoder_padding_mask = make_padding_mask(decoder_input_ids, pad_token_id)
    else:
        decoder_padding_mask = invert_mask(decoder_padding_mask)
    causal_mask = torch.triu(fill_with_neg_inf(torch.zeros(tgt_len, tgt_len)), 1).to(
        dtype=causal_mask_dtype, device=decoder_input_ids.device
    )
    return decoder_input_ids, decoder_padding_mask, causal_mask


class PretrainedBartModel(PreTrainedModel):
    config_class = BartConfig
    base_model_prefix = "model"
    pretrained_model_archive_map = BART_PRETRAINED_MODEL_ARCHIVE_MAP

    def _init_weights(self, module):
        std = self.config.init_std
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, SinusoidalPositionalEmbedding):
            pass  # sinusoidal positions are computed, not trained
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()

    @property
    def dummy_inputs(self):
        pad_token = self.config.pad_token_id
        input_ids = torch.tensor([[0, 6, 10, 4, 2], [0, 8, 12, 2, pad_token]], device=self.device)
        dummy_inputs = {
            "attention_mask": input_ids.ne(pad_token),
            "input_ids": input_ids,
        }
        return dummy_inputs


def _make_linear_from_emb(emb):
    vocab_size, emb_size = emb.weight.shape
    lin_layer = nn.Linear(vocab_size, emb_size, bias=False)
    lin_layer.weight.data = emb.weight.data
    return lin_layer


def _check_shapes(shape_1, shape2):
    if shape_1 != shape2:
        raise AssertionError("shape mismatch: {} != {}".format(shape_1, shape2))


def shift_tokens_right(input_ids, pad_token_id):
    """Shift input ids one token to the right, and wrap the last non pad token (usually <eos>)."""
    prev_output_tokens = input_ids.clone()
    index_of_eos = (input_ids.ne(pad_token_id).sum(dim=1) - 1).unsqueeze(-1)
    prev_output_tokens[:, 0] = input_ids.gather(1, index_of_eos).squeeze()
    prev_output_tokens[:, 1:] = input_ids[:, :-1]
    return prev_output_tokens


def make_padding_mask(input_ids, padding_idx=1):
    """True for pad tokens"""
    padding_mask = input_ids.eq(padding_idx)
    if not padding_mask.any():
        padding_mask = None
    return padding_mask


class EncoderLayer(nn.Module):
    def __init__(self, config: BartConfig):
        super().__init__()
        self.embed_dim = config.d_model
        self.output_attentions = config.output_attentions
        self.self_attn = SelfAttention(
            self.embed_dim, config.encoder_attention_heads, dropout=config.attention_dropout
        )
        self.normalize_before = config.normalize_before
        self.self_attn_layer_norm = LayerNorm(self.embed_dim)
        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]
        self.activation_dropout = config.activation_dropout
        self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
        self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
        self.final_layer_norm = LayerNorm(self.embed_dim)

    def forward(self, x, encoder_padding_mask):
        """
        Args:
            x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)`
            encoder_padding_mask (ByteTensor): binary ByteTensor of shape
                `(batch, src_len)` where padding elements are indicated by ``1``.
            a value of ``1`` marks a source position as excluded (masked out)
            from attention; ``0`` means it is included

        Returns:
            encoded output of shape `(seq_len, batch, embed_dim)`
        """
        residual = x
        if self.normalize_before:
            x = self.self_attn_layer_norm(x)
        x, attn_weights = self.self_attn(
            query=x, key=x, key_padding_mask=encoder_padding_mask, need_weights=self.output_attentions
        )
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = residual + x
        if not self.normalize_before:
            x = self.self_attn_layer_norm(x)

        residual = x
        if self.normalize_before:
            x = self.final_layer_norm(x)
        x = self.activation_fn(self.fc1(x))
        x = F.dropout(x, p=self.activation_dropout, training=self.training)
        x = self.fc2(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = residual + x
        if not self.normalize_before:
            x = self.final_layer_norm(x)
        return x, attn_weights


class BartEncoder(nn.Module):
    """
    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer
    is a :class:`EncoderLayer`.

    Args:
        config: BartConfig
    """

    def __init__(self, config: BartConfig, embed_tokens):
        super().__init__()
        self.dropout = config.dropout
        self.layerdrop = config.encoder_layerdrop
        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states

        embed_dim = embed_tokens.embedding_dim
        self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
        self.padding_idx = embed_tokens.padding_idx
        self.max_source_positions = config.max_position_embeddings
        self.embed_tokens = embed_tokens
        if config.static_position_embeddings:
            self.embed_positions = SinusoidalPositionalEmbedding(
                config.max_position_embeddings, embed_dim, self.padding_idx
            )
        else:
            self.embed_positions = LearnedPositionalEmbedding(
                config.max_position_embeddings, embed_dim, self.padding_idx
            )
        self.layers = nn.ModuleList([EncoderLayer(config) for _ in range(config.encoder_layers)])
        self.layernorm_embedding = LayerNorm(embed_dim) if config.normalize_embedding else nn.Identity()
        # mbart has an extra final layer norm
        self.layer_norm = LayerNorm(config.d_model) if config.normalize_before else None

    def forward(self, input_ids, attention_mask=None):
        """
        Args:
            input_ids (LongTensor): tokens in the source language of shape
                `(batch, src_len)`
            attention_mask (torch.LongTensor): indicating which indices are padding tokens.
        Returns:
            Tuple comprised of:
                - **x** (Tensor): the last encoder layer's output of
                  shape `(src_len, batch, embed_dim)`
                - **encoder_states** (List[Tensor]): all intermediate
                  hidden states of shape `(src_len, batch, embed_dim)`.
                  Only populated if *self.output_hidden_states* is True.
                - **all_attentions** (List[Tensor]): Attention weights for each layer.
                During training might not be of length n_layers because of layer dropout.
        """
        # check attention mask and invert
        if attention_mask is not None:
            attention_mask = invert_mask(attention_mask)

        inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
        embed_pos = self.embed_positions(input_ids)
        x = inputs_embeds + embed_pos
        x = self.layernorm_embedding(x)
        x = F.dropout(x, p=self.dropout, training=self.training)

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)

        encoder_states, all_attentions = [], []
        for encoder_layer in self.layers:
            if self.output_hidden_states:
                encoder_states.append(x)
            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
            dropout_probability = random.uniform(0, 1)
            if self.training and (dropout_probability < self.layerdrop):  # skip the layer
                attn = None
            else:
                x, attn = encoder_layer(x, attention_mask)

            if self.output_attentions:
                all_attentions.append(attn)

        if self.layer_norm:
            x = self.layer_norm(x)
        if self.output_hidden_states:
            encoder_states.append(x)

        # T x B x C -> B x T x C
        encoder_states = [hidden_state.transpose(0, 1) for hidden_state in encoder_states]
        x = x.transpose(0, 1)

        return x, encoder_states, all_attentions


class DecoderLayer(nn.Module):
    def __init__(self, config: BartConfig):
        super().__init__()
        self.embed_dim = config.d_model
        self.output_attentions = config.output_attentions
        self.self_attn = SelfAttention(
            embed_dim=self.embed_dim, num_heads=config.decoder_attention_heads, dropout=config.attention_dropout,
        )
        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]
        self.activation_dropout = config.activation_dropout
        self.normalize_before = config.normalize_before

        self.self_attn_layer_norm = LayerNorm(self.embed_dim)
        self.encoder_attn = SelfAttention(
            self.embed_dim,
            config.decoder_attention_heads,
            dropout=config.attention_dropout,
            encoder_decoder_attention=True,
        )
        self.encoder_attn_layer_norm = LayerNorm(self.embed_dim)
        self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
        self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
        self.final_layer_norm = LayerNorm(self.embed_dim)

    def forward(
        self,
        x,
        encoder_hidden_states,
        encoder_attn_mask=None,
        layer_state=None,
        causal_mask=None,
        decoder_padding_mask=None,
    ):
        residual = x
        if layer_state is None:
            layer_state = {}
        if self.normalize_before:
            x = self.self_attn_layer_norm(x)
        # Self Attention
        x, self_attn_weights = self.self_attn(
            query=x,
            key=x,
            layer_state=layer_state,  # adds keys to layer state
            key_padding_mask=decoder_padding_mask,
            attn_mask=causal_mask,
            need_weights=self.output_attentions,
        )
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = residual + x
        if not self.normalize_before:
            x = self.self_attn_layer_norm(x)

        # Cross-Attention Block
        residual = x
        assert self.encoder_attn.cache_key != self.self_attn.cache_key
        if self.normalize_before:
            x = self.encoder_attn_layer_norm(x)
        x, _ = self.encoder_attn(
            query=x,
            key=encoder_hidden_states,
            key_padding_mask=encoder_attn_mask,
            layer_state=layer_state,  # mutates layer state
        )
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = residual + x
        if not self.normalize_before:
            x = self.encoder_attn_layer_norm(x)

        # Fully Connected
        residual = x
        if self.normalize_before:
            x = self.final_layer_norm(x)
        x = self.activation_fn(self.fc1(x))
        x = F.dropout(x, p=self.activation_dropout, training=self.training)
        x = self.fc2(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = residual + x
        if not self.normalize_before:
            x = self.final_layer_norm(x)
        return (
            x,
            self_attn_weights,
            layer_state,
        )  # just self_attn weights for now, following t2t; layer_state = cache for decoding


class BartDecoder(nn.Module):
    """
    Transformer decoder consisting of *config.decoder_layers* layers. Each layer
    is a :class:`DecoderLayer`.
    Args:
        config: BartConfig
        embed_tokens (torch.nn.Embedding): output embedding
    """

    def __init__(self, config: BartConfig, embed_tokens: nn.Embedding):
        super().__init__()
        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states
        self.dropout = config.dropout
        self.layerdrop = config.decoder_layerdrop
        self.padding_idx = embed_tokens.padding_idx
        self.max_target_positions = config.max_position_embeddings
        self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
        self.embed_tokens = embed_tokens
        if config.static_position_embeddings:
            self.embed_positions = SinusoidalPositionalEmbedding(
                config.max_position_embeddings, config.d_model, config.pad_token_id
            )
        else:
            self.embed_positions = LearnedPositionalEmbedding(
                config.max_position_embeddings, config.d_model, self.padding_idx
            )
        self.layers = nn.ModuleList([DecoderLayer(config) for _ in range(config.decoder_layers)])
        self.layernorm_embedding = LayerNorm(config.d_model) if config.normalize_embedding else nn.Identity()
        self.layer_norm = LayerNorm(config.d_model) if config.add_final_layer_norm else None

    def forward(
        self,
        input_ids,
        encoder_hidden_states,
        encoder_padding_mask,
        decoder_padding_mask,
        decoder_causal_mask,
        decoder_cached_states=None,
        use_cache=False,
        **unused,
    ):
        """
        Includes several features from "Jointly Learning to Align and
        Translate with Transformer Models" (Garg et al., EMNLP 2019).

        Args:
            input_ids (LongTensor): previous decoder outputs of shape
                `(batch, tgt_len)`, for teacher forcing
            encoder_hidden_states: output from the encoder, used for
                encoder-side attention
            encoder_padding_mask: for ignoring pad tokens
            decoder_cached_states (dict or None): dictionary used for storing state during generation

        Returns:
            tuple:
                - the decoder's features of shape `(batch, tgt_len, embed_dim)`
                - hidden states
                - attentions
        """
        # check attention mask and invert
        if encoder_padding_mask is not None:
            encoder_padding_mask = invert_mask(encoder_padding_mask)

        # embed positions
        positions = self.embed_positions(input_ids, use_cache=use_cache)

        if use_cache:
            input_ids = input_ids[:, -1:]
            positions = positions[:, -1:]  # happens after we embed them

        x = self.embed_tokens(input_ids) * self.embed_scale
        x += positions
        x = self.layernorm_embedding(x)
        x = F.dropout(x, p=self.dropout, training=self.training)

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)
        encoder_hidden_states = encoder_hidden_states.transpose(0, 1)

        # decoder layers
        all_hidden_states = ()
        all_self_attns = ()
        next_decoder_cache = []
        for idx, decoder_layer in enumerate(self.layers):
            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
            if self.output_hidden_states:
                all_hidden_states += (x,)
            dropout_probability = random.uniform(0, 1)
            if self.training and (dropout_probability < self.layerdrop):
                continue

            layer_state = decoder_cached_states[idx] if decoder_cached_states is not None else None

            x, layer_self_attn, layer_past = decoder_layer(
                x,
                encoder_hidden_states,
                encoder_attn_mask=encoder_padding_mask,
                decoder_padding_mask=decoder_padding_mask,
                layer_state=layer_state,
                causal_mask=decoder_causal_mask,
            )
            if use_cache:
                next_decoder_cache.append(layer_past.copy())

            if self.layer_norm and (idx == len(self.layers) - 1):  # last layer of mbart
                x = self.layer_norm(x)
            if self.output_attentions:
                all_self_attns += (layer_self_attn,)

        # T x B x C -> B x T x C
        all_hidden_states = [hidden_state.transpose(0, 1) for hidden_state in all_hidden_states]
        x = x.transpose(0, 1)
        encoder_hidden_states = encoder_hidden_states.transpose(0, 1)

        if use_cache:
            next_cache = ((encoder_hidden_states, encoder_padding_mask), next_decoder_cache)
        else:
            next_cache = None
        return x, next_cache, all_hidden_states, list(all_self_attns)


def _reorder_buffer(attn_cache, new_order):
    for k, input_buffer_k in attn_cache.items():
        if input_buffer_k is not None:
            attn_cache[k] = input_buffer_k.index_select(0, new_order)
    return attn_cache


class SelfAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(
        self, embed_dim, num_heads, dropout=0.0, bias=True, encoder_decoder_attention=False,  # otherwise self_attention
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
        self.scaling = self.head_dim ** -0.5
        self.encoder_decoder_attention = encoder_decoder_attention
        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.cache_key = "encoder_decoder" if self.encoder_decoder_attention else "self"

    def _shape(self, tensor, dim_0, bsz):
        return tensor.contiguous().view(dim_0, bsz * self.num_heads, self.head_dim).transpose(0, 1)

    def forward(
        self,
        query,
        key: Optional[Tensor],
        key_padding_mask: Optional[Tensor] = None,
        layer_state: Optional[Dict[str, Optional[Tensor]]] = None,
        attn_mask: Optional[Tensor] = None,
        need_weights=False,
    ) -> Tuple[Tensor, Optional[Tensor]]:
        """Input shape: Time(SeqLen) x Batch x Channel"""
        static_kv = self.encoder_decoder_attention
        tgt_len, bsz, embed_dim = query.size()
        assert embed_dim == self.embed_dim
        assert list(query.size()) == [tgt_len, bsz, embed_dim]
        if layer_state is not None:  # reuse k,v and encoder_padding_mask
            saved_state = layer_state.get(self.cache_key, {})
            if "prev_key" in saved_state:
                # previous time steps are cached - no need to recompute key and value if they are static
                if static_kv:
                    key = None
        else:
            saved_state = None
            layer_state = {}

        q = self.q_proj(query) * self.scaling
        if static_kv:
            if key is None:
                k = v = None
            else:
                k = self.k_proj(key)
                v = self.v_proj(key)
        else:
            k = self.k_proj(query)
            v = self.v_proj(query)

        q = self._shape(q, tgt_len, bsz)
        if k is not None:
            k = self._shape(k, -1, bsz)
        if v is not None:
            v = self._shape(v, -1, bsz)

        if saved_state is not None:
            k, v, key_padding_mask = self._use_saved_state(k, v, saved_state, key_padding_mask, static_kv, bsz)

        # Update cache
        layer_state[self.cache_key] = {
            "prev_key": k.view(bsz, self.num_heads, -1, self.head_dim),
            "prev_value": v.view(bsz, self.num_heads, -1, self.head_dim),
            "prev_key_padding_mask": key_padding_mask if not static_kv else None,
        }

        assert k is not None
        src_len = k.size(1)
        attn_weights = torch.bmm(q, k.transpose(1, 2))
        assert attn_weights.size() == (bsz * self.num_heads, tgt_len, src_len)

        if attn_mask is not None:
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_mask
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        if key_padding_mask is not None and key_padding_mask.dim() == 0:
            key_padding_mask = None
        assert key_padding_mask is None or key_padding_mask.size()[:2] == (bsz, src_len)

        if key_padding_mask is not None:  # don't attend to padding symbols
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            reshaped = key_padding_mask.unsqueeze(1).unsqueeze(2)
            attn_weights = attn_weights.masked_fill(reshaped, float("-inf"))
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
        attn_weights = F.softmax(attn_weights, dim=-1)
        attn_probs = F.dropout(attn_weights, p=self.dropout, training=self.training)

        assert v is not None
        attn_output = torch.bmm(attn_probs, v)
        assert attn_output.size() == (bsz * self.num_heads, tgt_len, self.head_dim)
        attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
        attn_output = self.out_proj(attn_output)
        if need_weights:
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
        else:
            attn_weights = None
        return attn_output, attn_weights

    def _use_saved_state(self, k, v, saved_state, key_padding_mask, static_kv, bsz):
        # saved states are stored with shape (bsz, num_heads, seq_len, head_dim)
        if "prev_key" in saved_state:
            _prev_key = saved_state["prev_key"]
            assert _prev_key is not None
            prev_key = _prev_key.view(bsz * self.num_heads, -1, self.head_dim)
            if static_kv:
                k = prev_key
            else:
                assert k is not None
                k = torch.cat([prev_key, k], dim=1)
        if "prev_value" in saved_state:
            _prev_value = saved_state["prev_value"]
            assert _prev_value is not None
            prev_value = _prev_value.view(bsz * self.num_heads, -1, self.head_dim)
            if static_kv:
                v = prev_value
            else:
                assert v is not None
                v = torch.cat([prev_value, v], dim=1)
        assert k is not None and v is not None
        prev_key_padding_mask = saved_state.get("prev_key_padding_mask", None)
        key_padding_mask = self._cat_prev_key_padding_mask(
            key_padding_mask, prev_key_padding_mask, bsz, k.size(1), static_kv
        )
        return k, v, key_padding_mask

    @staticmethod
    def _cat_prev_key_padding_mask(
        key_padding_mask: Optional[Tensor],
        prev_key_padding_mask: Optional[Tensor],
        batch_size: int,
        src_len: int,
        static_kv: bool,
    ) -> Optional[Tensor]:
        # saved key padding masks have shape (bsz, seq_len)
        if prev_key_padding_mask is not None:
            if static_kv:
                new_key_padding_mask = prev_key_padding_mask
            else:
                new_key_padding_mask = torch.cat([prev_key_padding_mask, key_padding_mask], dim=1)
        # During incremental decoding, as the padding token enters and
        # leaves the frame, there will be a time when prev or current is None
        elif key_padding_mask is not None:
            filler = torch.zeros(
                batch_size,
                src_len - key_padding_mask.size(1),
                dtype=key_padding_mask.dtype,
                device=key_padding_mask.device,
            )
            new_key_padding_mask = torch.cat([filler, key_padding_mask], dim=1)
        else:
            new_key_padding_mask = prev_key_padding_mask
        return new_key_padding_mask


class BartClassificationHead(nn.Module):
    """Head for sentence-level classification tasks."""

    def __init__(self, input_dim, inner_dim, num_classes, pooler_dropout):
        super().__init__()
        self.dense = nn.Linear(input_dim, inner_dim)
        self.dropout = nn.Dropout(p=pooler_dropout)
        self.out_proj = nn.Linear(inner_dim, num_classes)

    def forward(self, x):
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x


class LearnedPositionalEmbedding(nn.Embedding):
    """
    This module learns positional embeddings up to a fixed maximum size.
    Padding ids are ignored by either offsetting based on padding_idx
    or by setting padding_idx to None and ensuring that the appropriate
    position ids are passed to the forward function.
    """

    def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int):
        assert padding_idx is not None
        # position ids produced by create_position_ids_from_input_ids start at
        # padding_idx + 1, so the embedding table is enlarged accordingly
        num_embeddings += padding_idx + 1
        super().__init__(num_embeddings, embedding_dim, padding_idx=padding_idx)

    def forward(self, input, use_cache=False):
        """Input is expected to be of size [bsz x seqlen]."""
        if use_cache:  # the position is our current step in the decoded sequence
            pos = int(self.padding_idx + input.size(1))
            positions = input.data.new(1, 1).fill_(pos)
        else:
            positions = create_position_ids_from_input_ids(input, self.padding_idx)
        return super().forward(positions)


def LayerNorm(normalized_shape, eps=1e-5, elementwise_affine=True):
    if torch.cuda.is_available():
        try:
            from apex.normalization import FusedLayerNorm

            return FusedLayerNorm(normalized_shape, eps, elementwise_affine)
        except ImportError:
            pass
    return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine)


def fill_with_neg_inf(t):
    """FP16-compatible function that fills a input_ids with -inf."""
    return t.float().fill_(float("-inf")).type_as(t)


def _filter_out_falsey_values(tup) -> Tuple:
    """Remove entries that are None or [] from an iterable."""
    return tuple(x for x in tup if isinstance(x, torch.Tensor) or x)


def _get_shape(t):
    return getattr(t, "shape", None)


@add_start_docstrings(
    "The bare BART Model outputting raw hidden-states without any specific head on top.", BART_START_DOCSTRING,
)
class BartModel(PretrainedBartModel):
    def __init__(self, config: BartConfig):
        super().__init__(config)
        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states

        padding_idx, vocab_size = config.pad_token_id, config.vocab_size
        self.shared = nn.Embedding(vocab_size, config.d_model, padding_idx)

        self.encoder = BartEncoder(config, self.shared)
        self.decoder = BartDecoder(config, self.shared)

        self.init_weights()

    @add_start_docstrings_to_callable(BART_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids,
        attention_mask=None,
        decoder_input_ids=None,
        encoder_outputs: Optional[Tuple] = None,
        decoder_attention_mask=None,
        decoder_cached_states=None,
        use_cache=False,
    ):
        # make masks if user doesn't supply
        if not use_cache:
            decoder_input_ids, decoder_padding_mask, causal_mask = _prepare_bart_decoder_inputs(
                self.config,
                input_ids,
                decoder_input_ids=decoder_input_ids,
                decoder_padding_mask=decoder_attention_mask,
                causal_mask_dtype=self.shared.weight.dtype,
            )
        else:
            decoder_padding_mask, causal_mask = None, None

        assert decoder_input_ids is not None
        if encoder_outputs is None:
            encoder_outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        assert isinstance(encoder_outputs, tuple)
        # decoder outputs consist of (dec_features, layer_state, dec_hidden, dec_attn)
        decoder_outputs = self.decoder(
            decoder_input_ids,
            encoder_outputs[0],
            attention_mask,
            decoder_padding_mask,
            decoder_causal_mask=causal_mask,
            decoder_cached_states=decoder_cached_states,
            use_cache=use_cache,
        )
        # Attention and hidden_states will be [] or None if they aren't needed
        decoder_outputs = _filter_out_falsey_values(decoder_outputs)
        assert isinstance(decoder_outputs[0], torch.Tensor)
        encoder_outputs = _filter_out_falsey_values(encoder_outputs)
        return decoder_outputs + encoder_outputs

    def get_input_embeddings(self):
        return self.shared

    def set_input_embeddings(self, value):
        self.shared = value
        self.encoder.embed_tokens = self.shared
        self.decoder.embed_tokens = self.shared

    def get_output_embeddings(self):
        return _make_linear_from_emb(self.shared)  # make it on the fly


@add_start_docstrings(
    "The BART Model with a language modeling head. Can be used for summarization.",
    BART_START_DOCSTRING + BART_GENERATION_EXAMPLE,
)
class BartForConditionalGeneration(PretrainedBartModel):
    base_model_prefix = "model"

    def __init__(self, config: BartConfig):
        super().__init__(config)
        base_model = BartModel(config)
        self.model = base_model
        self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings)))

    def resize_token_embeddings(self, new_num_tokens: int) -> nn.Embedding:
        old_num_tokens = self.model.shared.num_embeddings
        new_embeddings = super().resize_token_embeddings(new_num_tokens)
        self.model.shared = new_embeddings
        self._resize_final_logits_bias(new_num_tokens, old_num_tokens)
        return new_embeddings

    def _resize_final_logits_bias(self, new_num_tokens: int, old_num_tokens: int) -> None:
        if new_num_tokens <= old_num_tokens:
            new_bias = self.final_logits_bias[:, :new_num_tokens]
        else:
            extra_bias = torch.zeros((1, new_num_tokens - old_num_tokens), device=self.final_logits_bias.device)
            new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1)
        self.register_buffer("final_logits_bias", new_bias)

    @add_start_docstrings_to_callable(BART_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids,
        attention_mask=None,
        encoder_outputs=None,
        decoder_input_ids=None,
        decoder_attention_mask=None,
        decoder_cached_states=None,
        lm_labels=None,
        use_cache=False,
        **unused,
    ):
        r"""
        masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Labels for computing the masked language modeling loss.
            Indices should either be in ``[0, ..., config.vocab_size]`` or -100 (see ``input_ids`` docstring).
            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens
            with labels
            in ``[0, ..., config.vocab_size]``.

    Returns:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BartConfig`) and inputs:
        masked_lm_loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
            Masked language modeling loss.
        prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

            # Mask filling only works for bart-large
            from transformers import BartTokenizer, BartForConditionalGeneration
            tokenizer = BartTokenizer.from_pretrained('bart-large')
            TXT = "My friends are <mask> but they eat too many carbs."
            model = BartForConditionalGeneration.from_pretrained('bart-large')
            input_ids = tokenizer.batch_encode_plus([TXT], return_tensors='pt')['input_ids']
            logits = model(input_ids)[0]
            masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()
            probs = logits[0, masked_index].softmax(dim=0)
            values, predictions = probs.topk(5)
            tokenizer.decode(predictions).split()
            # ['good', 'great', 'all', 'really', 'very']
        """
        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            encoder_outputs=encoder_outputs,
            decoder_attention_mask=decoder_attention_mask,
            decoder_cached_states=decoder_cached_states,
            use_cache=use_cache,
        )
        lm_logits = F.linear(outputs[0], self.model.shared.weight, bias=self.final_logits_bias)
        outputs = (lm_logits,) + outputs[1:]  # Add hidden states and attention if they are here
        if lm_labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), lm_labels.view(-1))
            outputs = (masked_lm_loss,) + outputs

        return outputs

    def prepare_inputs_for_generation(self, decoder_input_ids, past, attention_mask, use_cache, **kwargs):
        assert past is not None, "past has to be defined for encoder_outputs"

        # first step, decoder_cached_states are empty
        if not past[1]:
            encoder_outputs, decoder_cached_states = past, None
        else:
            encoder_outputs, decoder_cached_states = past
        return {
            "input_ids": None,  # encoder_outputs is defined. input_ids not needed
            "encoder_outputs": encoder_outputs,
            "decoder_cached_states": decoder_cached_states,
            "decoder_input_ids": decoder_input_ids,
            "attention_mask": attention_mask,
            "use_cache": use_cache,
        }

    def prepare_logits_for_generation(self, logits, cur_len, max_length):
        if cur_len == 1:
            self._force_token_ids_generation(logits, self.config.bos_token_id)
        if cur_len == max_length - 1 and self.config.eos_token_id is not None:
            self._force_token_ids_generation(logits, self.config.eos_token_id)
        return logits

    def _force_token_ids_generation(self, scores, token_ids) -> None:
        """force one of token_ids to be generated by setting prob of all other tokens to 0"""
        if isinstance(token_ids, int):
            token_ids = [token_ids]
        all_but_token_ids_mask = torch.tensor(
            [x for x in range(self.config.vocab_size) if x not in token_ids],
            dtype=torch.long,
            device=next(self.parameters()).device,
        )
        assert len(scores.shape) == 2, "scores should be of rank 2 with shape: [batch_size, vocab_size]"
        scores[:, all_but_token_ids_mask] = -float("inf")

    @staticmethod
    def _reorder_cache(past, beam_idx):
        ((enc_out, enc_mask), decoder_cached_states) = past
        reordered_past = []
        for layer_past in decoder_cached_states:
            # get the correct batch idx from decoder layer's batch dim for cross and self-attn
            layer_past_new = {
                attn_key: _reorder_buffer(attn_cache, beam_idx) for attn_key, attn_cache in layer_past.items()
            }
            reordered_past.append(layer_past_new)
        new_enc_out = enc_out if enc_out is None else enc_out.index_select(0, beam_idx)
        new_enc_mask = enc_mask if enc_mask is None else enc_mask.index_select(0, beam_idx)

        past = ((new_enc_out, new_enc_mask), reordered_past)
        return past

    def get_encoder(self):
        return self.model.encoder

    def get_output_embeddings(self):
        return _make_linear_from_emb(self.model.shared)  # make it on the fly


@add_start_docstrings(
    """Bart model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """,
    BART_START_DOCSTRING,
)
class BartForSequenceClassification(PretrainedBartModel):
    def __init__(self, config: BartConfig, **kwargs):
        super().__init__(config, **kwargs)
        self.model = BartModel(config)
        self.classification_head = BartClassificationHead(
            config.d_model, config.d_model, config.num_labels, config.classif_dropout,
        )
        self.model._init_weights(self.classification_head.dense)
        self.model._init_weights(self.classification_head.out_proj)

    @add_start_docstrings_to_callable(BART_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids,
        attention_mask=None,
        encoder_outputs=None,
        decoder_input_ids=None,
        decoder_attention_mask=None,
        labels=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for computing the sequence classification/regression loss.
            Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).

    Returns:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BartConfig`) and inputs:
            loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
                Classification loss (cross entropy)
            logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
                Classification (or regression if config.num_labels==1) scores (before SoftMax).
            hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
                Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
                of shape :obj:`(batch_size, sequence_length, hidden_size)`.
                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
            attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
                Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
                Attention weights after the attention softmax, used to compute the weighted average in the
                self-attention
                heads.

    Examples::

        from transformers import BartTokenizer, BartForSequenceClassification
        import torch

        tokenizer = BartTokenizer.from_pretrained('bart-large')
        model = BartForSequenceClassification.from_pretrained('bart-large')
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute",
        add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids, labels=labels)
        loss, logits = outputs[:2]

        """
        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            encoder_outputs=encoder_outputs,
        )
        x = outputs[0]  # last hidden state
        eos_mask = input_ids.eq(self.config.eos_token_id)
        if len(torch.unique(eos_mask.sum(1))) > 1:
            raise ValueError("All examples must have the same number of <eos> tokens.")
        sentence_representation = x[eos_mask, :].view(x.size(0), -1, x.size(-1))[:, -1, :]
        logits = self.classification_head(sentence_representation)
        # Prepend logits
        outputs = (logits,) + outputs[1:]  # Add hidden states and attention if they are here
        if labels is not None:  # prepend loss to output
            loss = F.cross_entropy(logits.view(-1, self.config.num_labels), labels.view(-1))
            outputs = (loss,) + outputs

        return outputs


class SinusoidalPositionalEmbedding(nn.Embedding):
    """This module produces sinusoidal positional embeddings of any length."""

    def __init__(self, num_positions, embedding_dim, padding_idx=None):
        super().__init__(num_positions, embedding_dim)
        if embedding_dim % 2 != 0:
            raise NotImplementedError(f"odd embedding_dim {embedding_dim} not supported")
        self.weight = self._init_weight(self.weight)

    @staticmethod
    def _init_weight(out: nn.Parameter):
        """Identical to the XLM create_sinusoidal_embeddings except features are not interleaved.
            The cos features are in the 2nd half of the vector. [dim // 2:]
        """
        n_pos, dim = out.shape
        position_enc = np.array(
            [[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)]
        )
        out[:, 0 : dim // 2] = torch.FloatTensor(np.sin(position_enc[:, 0::2]))
        out[:, dim // 2 :] = torch.FloatTensor(np.cos(position_enc[:, 1::2]))
        out.detach_()
        out.requires_grad = False
        return out

    @torch.no_grad()
    def forward(self, input_ids, use_cache=False):
        """Input is expected to be of size [bsz x seqlen]."""
        bsz, seq_len = input_ids.shape[:2]
        if use_cache:
            positions = input_ids.data.new(1, 1).fill_(seq_len - 1)  # called before slicing
        else:
            # starts at 0, ends at seq_len - 1
            positions = torch.arange(seq_len, dtype=torch.long, device=self.weight.device)
        return super().forward(positions)
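

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original fairseq/transformers
# code): a minimal demo of the decoder-input helpers above, assuming
# BartConfig's default special tokens (pad_token_id=1, eos_token_id=2).
# Because of the relative imports, run it as a module, e.g.
# ``python -m transformers.modeling_bart``; importing the module is unaffected.
if __name__ == "__main__":
    _config = BartConfig()
    _input_ids = torch.tensor([[0, 11, 12, 13, 2], [0, 21, 22, 2, _config.pad_token_id]])

    # Decoder inputs are the encoder inputs shifted one position to the right,
    # with the last non-pad token (usually <eos>) wrapped around to position 0.
    print(shift_tokens_right(_input_ids, _config.pad_token_id))

    _decoder_input_ids, _decoder_padding_mask, _causal_mask = _prepare_bart_decoder_inputs(_config, _input_ids)
    print(_decoder_input_ids.shape)  # (batch, tgt_len)
    print(_causal_mask)  # (tgt_len, tgt_len), -inf above the diagonal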