U
    &cb                     @   s  d Z ddlZddlZddlZddlmZ ddlmZ ddlm	Z	 ddl
mZmZ ddlmZmZ eeZdd	iZd
d Zdd Zd ddZG dd dejjZdd ZG dd dejjZG dd deZdZdZedeG dd deZedeG dd deZdS )!z PyTorch CTRL model.    N)CrossEntropyLoss   )
CTRLConfig)add_start_docstrings add_start_docstrings_to_callable)Conv1DPreTrainedModelZctrlz?https://storage.googleapis.com/sf-ctrl/pytorch/seqlen256_v1.binc                 C   s$   dt dd|d  |  }| | S )Nr   i'     )torchpow)posid_model_sizeZangle_rates r   >/tmp/pip-unpacked-wheel-ymerj3tt/transformers/modeling_ctrl.py
angle_defn$   s    r   c                 C   sz   t tj| |ddtj||dd|}t|d d dd df }t|d d dd df }tj||gdd}|S )Ndtyper   r   r	   Zdim)r   r
   arange	unsqueezesincoscat)positionr   r   Z
angle_radsZsinesZcosinespos_encodingr   r   r   positional_encoding)   s    r   c              	   C   s   t | |dddd}|jd }|t| }|d k	rn|d|d }	}
|||
|	 |
d |
f d 7 }|d k	r~|| }t j|dd}|d k	r|| }t ||}||fS )	Nr   r      r	   r        r   )r
   matmulpermuteshapenpsqrtsizeZsoftmax)qkvmaskattention_mask	head_maskZ	matmul_qkZdkZscaled_attention_logitsZndnsZattention_weightsoutputr   r   r   scaled_dot_product_attention8   s    
 r/   c                       s0   e Zd Zd	 fdd	Zdd Zd
ddZ  ZS )MultiHeadAttentionFc                    sp   t    || _|| _|| _t|| j | _tj	||| _
tj	||| _tj	||| _tj	||| _d S N)super__init__output_attentions	num_headsr   intdepthr
   nnLinearWqWkWvdense)selfr   r5   r4   	__class__r   r   r3   S   s    
zMultiHeadAttention.__init__c                 C   s&   | |d| j| j}|ddddgS )Nr   r   r	   r   r   )reshaper5   r7   r"   )r>   x
batch_sizer   r   r   split_into_headsa   s    z#MultiHeadAttention.split_into_headsNc	                 C   s  |j d }	| |}| |}| |}| ||	}| ||	}| ||	}|d k	r|d |d  }
}tj|
|fdd}tj||fdd}|dkrt||f}nd}t||||||}|d 	ddddg}|d }|
|	d	| j}| |}||f}| jr
||f }|S )
Nr   r   r   r   Tr1   r	   r   r   )r#   r:   r;   r<   rD   r
   r   stackr/   r"   rA   r   r=   r4   )r>   r)   r(   r'   r*   
layer_pastr+   r,   	use_cacherC   Zpast_keyZ
past_valuepresentr.   Zscaled_attentionattnZoriginal_size_attentionoutputsr   r   r   forwarde   s.    





zMultiHeadAttention.forward)F)NNNF)__name__
__module____qualname__r3   rD   rK   __classcell__r   r   r?   r   r0   R   s   r0   c                 C   s*   t jt j| |t j t j|| S r1   )r
   r8   Z
Sequentialr9   ZReLU)r   dffr   r   r   point_wise_feed_forward_network   s    rQ   c                       s(   e Zd Zd fdd	Zd	ddZ  ZS )
EncoderLayer皙?Fc                    sh   t    t|||| _t||| _tjj|dd| _	tjj|dd| _
tj|| _tj|| _d S )Ngư>Zeps)r2   r3   r0   multi_head_attentionrQ   ffnr
   r8   	LayerNorm
layernorm1
layernorm2Dropoutdropout1dropout2)r>   r   r5   rP   Zrater4   r?   r   r   r3      s    
zEncoderLayer.__init__Nc              
   C   sz   |  |}| j||||||||d}|d }	| |	}	||	 }
| |
}| |}| |}|
| }|f|dd   }|S )NrF   r+   r,   rG   r   r   )rX   rU   r[   rY   rV   r\   )r>   rB   r*   rF   r+   r,   rG   normedZattn_outputsZattn_outputZout1Zout2Z
ffn_outputrJ   r   r   r   rK      s(    





zEncoderLayer.forward)rS   F)NNNF)rL   rM   rN   r3   rK   rO   r   r   r?   r   rR      s   rR   c                   @   s$   e Zd ZdZeZeZdZdd Z	dS )CTRLPreTrainedModelz An abstract class to handle weights initialization and
        a simple interface for downloading and loading pretrained models.
    transformerc                 C   s|   t |tjtjtfrR|jjjd| jj	d t |tjtfrx|j
dk	rx|j
j  n&t |tjrx|j
j  |jjd dS )z! Initialize the weights.
        g        )ZmeanZstdN      ?)
isinstancer8   r9   	Embeddingr   ZweightdataZnormal_configZinitializer_rangebiasZzero_rW   Zfill_)r>   moduler   r   r   _init_weights   s    z!CTRLPreTrainedModel._init_weightsN)
rL   rM   rN   __doc__r   Zconfig_class!CTRL_PRETRAINED_MODEL_ARCHIVE_MAPZpretrained_model_archive_mapZbase_model_prefixrh   r   r   r   r   r_      s
   r_   am  
    This model is a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`_ sub-class.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general
    usage and behavior.

    Parameters:
        config (:class:`~transformers.CTRLConfig`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the configuration.
            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
a  
    Args:
        input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.
            If `past` is used, optionally only the last `input_ids` have to be input (see `past`).

            Indices can be obtained using :class:`transformers.CTRLTokenizer`.
            See :func:`transformers.PreTrainedTokenizer.encode` and
            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.

            `What are input IDs? <../glossary.html#input-ids>`__
        past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
            Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
            (see `past` output below). Can be used to speed up sequential decoding.
            If `past` is used, the user can optionally input only the last `input_ids`
            (those that don't have their past given to this model) of shape :obj:`(batch_size, 1)`
            instead of all `input_ids` of shape :obj:`(batch_size, sequence_length)`.
        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.

            `What are attention masks? <../glossary.html#attention-mask>`__
        token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Segment token indices to indicate first and second portions of the inputs.
            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
            corresponds to a `sentence B` token
            If `past` is used, optionally only the last `token_type_ids` have to be input (see `past`).

            `What are token type IDs? <../glossary.html#token-type-ids>`_
        position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Selected in the range ``[0, config.max_position_embeddings - 1]``.

            `What are position IDs? <../glossary.html#position-ids>`_
        head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
            Mask to nullify selected heads of the self-attention modules.
            Mask values selected in ``[0, 1]``:
            :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
        input_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
            than the model's internal embedding lookup matrix.
            If `past` is used, optionally only the last `input_embeds` have to be input (see `past`).
        use_cache (:obj:`bool`):
            If `use_cache` is True, `past` key value states are returned and
            can be used to speed up decoding (see `past`). Defaults to `True`.
z^The bare CTRL Model transformer outputting raw hidden-states without any specific head on top.c                	       sF   e Zd Z fddZdd Zdd Zdd ZeedddZ	  Z
S )	CTRLModelc                    s   t     j| _ j| _ j| _ j| _t j	| jt
j| _t j j| _t j| _t fddt jD | _tj j jd| _|   d S )Nc                    s&   g | ]}t  j j j j jqS r   )rR   n_embdZn_headrP   Zresid_pdropr4   ).0_re   r   r   
<listcomp>  s   z&CTRLModel.__init__.<locals>.<listcomp>rT   )r2   r3   output_hidden_statesr4   rl   r   n_layerZ
num_layersr   Zn_positionsr
   floatr   r8   rc   
vocab_sizewrZ   Z
embd_pdropdropoutZ
ModuleListrangehrW   Zlayer_norm_epsilon	layernorminit_weightsr>   re   r?   ro   r   r3     s    
zCTRLModel.__init__c                 C   s   | j S r1   ru   r>   r   r   r   get_input_embeddings  s    zCTRLModel.get_input_embeddingsc                 C   s
   || _ d S r1   r|   )r>   Znew_embeddingsr   r   r   set_input_embeddings"  s    zCTRLModel.set_input_embeddingsc                 C   s(   |  D ]\}}| j| j| qdS )z~ Prunes heads of the model.
                heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
        N)itemsrx   rI   Zprune_heads)r>   Zheads_to_pruneZlayerZheadsr   r   r   _prune_heads%  s    zCTRLModel._prune_headsNTc	              	      s  |dk	r\|dk	r$|ddddf }|dk	r@|ddddf }|dk	r\|ddddf }|dk	rv|dk	rvt dnX|dk	r| }	|d|	d }|jd }
n,|dk	r| dd }	|jd }
nt d|dkrd}dgt| j }n|d d d}|dkrP|dk	r|jn|j}tj||	d | tj	|d}|
dd|	d }|dk	r|
dksltd||
d}|
d	
d
}|j| jd}d| d }| || jj}|dk	r|d|	d }| |}|t| j9 }nd}|d|	d }|dkr| |}|	d }tt|| || d	|j}|t| j9 }| j|ddf |j}|| | }| |}|	|df }d}d}g }tt| j|D ]x\}\}}| jr||j| f }||||||| |d}|dd
 \}}|dkr||f }| jr||d
  q| |}|j| }| jrH||f }|f}|dkrb||f }| jrt||f }| jr|	dd d |d jdd   t fdd|D }||f }|S )a  
    Return:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.CTRLConfig`) and inputs:
        last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the last layer of the model.
        past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`):
            Contains pre-computed hidden-states (key and values in the attention blocks).
            Can be used (see `past` input) to speed up sequential decoding.
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        from transformers import CTRLTokenizer, CTRLModel
        import torch

        tokenizer = CTRLTokenizer.from_pretrained('ctrl')
        model = CTRLModel.from_pretrained('ctrl')

        input_ids = torch.tensor(tokenizer.encode("Links Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids)

        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple

        Nr   zDYou cannot specify both input_ids and inputs_embeds at the same timer   z5You have to specify either input_ids or inputs_embedsr   )r   devicez$batch_size has to be defined and > 0r   r	   r   ra   r    r   r]   T)r   c                 3   s   | ]}|j   V  qd S r1   )view)rm   tZattention_output_shaper   r   	<genexpr>  s     z$CTRLModel.forward.<locals>.<genexpr>) 
ValueErrorr&   r   r#   lenrx   r   r
   r   longr   AssertionErrortor   Zget_head_maskre   rr   ru   r$   r%   r   ZtriuZonesr   rv   	enumerateziprq   r4   appendry   tuple)r>   	input_idspastr+   token_type_idsposition_idsr,   inputs_embedsrG   Zinput_shaperC   Zpast_lengthr   Ztoken_type_embedsZseq_lenr*   Z
pos_embedshidden_statesZoutput_shapeZpresentsZall_hidden_statesZall_attentionsr   rx   rF   rJ   rH   r   r   r   rK   ,  s    1






$








"
zCTRLModel.forward)NNNNNNNT)rL   rM   rN   r3   r~   r   r   r   CTRL_INPUTS_DOCSTRINGrK   rO   r   r   r?   r   rk     s           rk   z~The CTRL Model transformer with a language modeling head on top
    (linear layer with weights tied to the input embeddings). c                
       s>   e Zd Z fddZdd Zdd Zeedd	d
Z  Z	S )CTRLLMHeadModelc                    s8   t  | t|| _tj|j|jdd| _| 	  d S )NT)rf   )
r2   r3   rk   r`   r8   r9   rl   rt   lm_headrz   r{   r?   r   r   r3     s    
zCTRLLMHeadModel.__init__c                 C   s   | j S r1   )r   r}   r   r   r   get_output_embeddings  s    z%CTRLLMHeadModel.get_output_embeddingsc                 K   s*   |r|d d df  d}|||d dS )Nr   rG   )r   r   rG   )r   )r>   r   r   kwargsr   r   r   prepare_inputs_for_generation  s    z-CTRLLMHeadModel.prepare_inputs_for_generationNTc
              
   C   s   | j ||||||||	d}
|
d }| |}|f|
dd  }|dk	r|dddddf  }|dddf  }t }||d|d|d}|f| }|S )aY
  
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Labels for language modeling.
            Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
            Indices are selected in ``[-100, 0, ..., config.vocab_size]``
            All labels set to ``-100`` are ignored (masked), the loss is only
            computed for labels in ``[0, ..., config.vocab_size]``

    Return:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.CTRLConfig`) and inputs:
        loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when ``labels`` is provided)
            Language modeling loss.
        prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`):
            Contains pre-computed hidden-states (key and values in the attention blocks).
            Can be used (see `past` input) to speed up sequential decoding.
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        import torch
        from transformers import CTRLTokenizer, CTRLLMHeadModel

        tokenizer = CTRLTokenizer.from_pretrained('ctrl')
        model = CTRLLMHeadModel.from_pretrained('ctrl')

        input_ids = torch.tensor(tokenizer.encode("Links Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids, labels=input_ids)
        loss, logits = outputs[:2]

        )r   r+   r   r   r,   r   rG   r   r   N.r   )r`   r   
contiguousr   r   r&   )r>   r   r   r+   r   r   r,   r   labelsrG   Ztransformer_outputsr   Z	lm_logitsrJ   Zshift_logitsZshift_labelsZloss_fctZlossr   r   r   rK     s(    7

zCTRLLMHeadModel.forward)	NNNNNNNNT)
rL   rM   rN   r3   r   r   r   r   rK   rO   r   r   r?   r   r     s            r   )NN) ri   loggingZnumpyr$   r
   Ztorch.nnr8   r   Zconfiguration_ctrlr   Z
file_utilsr   r   Zmodeling_utilsr   r   	getLoggerrL   loggerrj   r   r   r/   Moduler0   rQ   rR   r_   ZCTRL_START_DOCSTRINGr   rk   r   r   r   r   r   <module>   s<   

3&2 I