import logging
import os
from typing import Callable, Tuple

import torch
from torch import Tensor, device, dtype, nn
from torch.nn import CrossEntropyLoss
from torch.nn import functional as F

from .activations import get_activation
from .configuration_utils import PretrainedConfig
from .file_utils import (
    DUMMY_INPUTS,
    TF2_WEIGHTS_NAME,
    TF_WEIGHTS_NAME,
    WEIGHTS_NAME,
    cached_path,
    hf_bucket_url,
    is_remote_url,
)


logger = logging.getLogger(__name__)

try:
    from torch.nn import Identity
except ImportError:
    # Compatibility with older PyTorch versions that do not ship nn.Identity
    class Identity(nn.Module):
        r"""A placeholder identity operator that is argument-insensitive."""

        def __init__(self, *args, **kwargs):
            super().__init__()

        def forward(self, input):
            return input


class ModuleUtilsMixin:
    """
    A few utilities for torch.nn.Modules, to be used as a mixin.
    F)only_trainablereturnc                 C   s0   |rt dd |  n|  }tdd |D S )zQ
        Get number of (optionally, trainable) parameters in the module.
        c                 S   s   | j S r   )requires_gradxr   r   r   <lambda>C       z1ModuleUtilsMixin.num_parameters.<locals>.<lambda>c                 s   s   | ]}|  V  qd S r   )Znumel).0pr   r   r   	<genexpr>D   s     z2ModuleUtilsMixin.num_parameters.<locals>.<genexpr>)filter
parameterssum)r   r(   paramsr   r   r   num_parameters?   s    zModuleUtilsMixin.num_parametersc                 O   sL   zdd l }W n tk
r(   tdY nX |t }| }|j| _d S )Nr   FYou need to install psutil (pip install psutil) to use memory tracing.)psutilImportErrorProcessosgetpidmemory_inforssmem_rss_pre_forward)moduler   r   r8   processmemr   r   r   _hook_rss_memory_pre_forwardF   s    z-ModuleUtilsMixin._hook_rss_memory_pre_forwardc                 O   sr   zdd l }W n tk
r(   tdY nX |t }| }|j| _| j| j }|t	| drf| j
nd | _
d S )Nr   r7   mem_rss_diff)r8   r9   r:   r;   r<   r=   r>   mem_rss_post_forwardr?   hasattrrD   )r@   r   r   r8   rA   rB   rD   r   r   r   _hook_rss_memory_post_forwardR   s    z.ModuleUtilsMixin._hook_rss_memory_post_forwardc                 C   s2   |   D ]}|| j || j q|   dS )a   Add a memory hook before and after each sub-module forward pass to record increase in memory consumption.
            Increase in memory consumption is stored in a `mem_rss_diff` attribute for each module and can be reset to zero with `model.reset_memory_hooks_state()`
        N)modulesZregister_forward_pre_hookrC   Zregister_forward_hookrG   reset_memory_hooks_stater   r@   r   r   r   add_memory_hooks`   s    z!ModuleUtilsMixin.add_memory_hooksc                 C   s$   |   D ]}d|_d|_d|_qd S Nr   )rH   rD   rE   r?   rJ   r   r   r   rI   i   s    z)ModuleUtilsMixin.reset_memory_hooks_state)r)   c                 C   s   t |  jS r   )nextr3   r   r   r   r   r   r   o   s    zModuleUtilsMixin.devicec                 C   s   t |  jS r   )rM   r3   r   rN   r   r   r   r   s   s    zModuleUtilsMixin.dtype)encoder_attention_maskr)   c                 C   sj   |  dkr(|dddddddf }|  dkrL|ddddddf }|j| jd}d| d }|S )z"type: torch.Tensor -> torch.Tensor   N   r         ?    e)dimtor   )r   rO   Zencoder_extended_attention_maskr   r   r   invert_attention_maskw   s    z&ModuleUtilsMixin.invert_attention_mask)attention_maskinput_shaper   c           	      C   s  |  dkr*|dddddddf }n|  dkr| jjr|\}}tj||d}|ddddf ||d|ddddf k}||j}|dddddddf |ddddddf  }q|ddddddf }ntd	||j
|j| jd}d| d	 }|S )
a  Makes broadcastable attention mask and causal mask so that future and maked tokens are ignored.

        Arguments:
            attention_mask: torch.Tensor with 1 indicating tokens to ATTEND to
            input_shape: tuple, shape of input_ids
            device: torch.Device, usually self.device

        Returns:
            torch.Tensor with dtype of attention_mask.dtype
        """
        # We can provide a self-attention mask of dimensions
        # [batch_size, from_seq_length, to_seq_length] ourselves in which case we
        # just need to make it broadcastable to all heads.
        if attention_mask.dim() == 3:
            extended_attention_mask = attention_mask[:, None, :, :]
        elif attention_mask.dim() == 2:
            # Provided a padding mask of dimensions [batch_size, seq_length];
            # make it broadcastable to [batch_size, num_heads, seq_length, seq_length].
            if self.config.is_decoder:
                # For a decoder, apply a causal mask in addition to the padding mask.
                batch_size, seq_length = input_shape
                seq_ids = torch.arange(seq_length, device=device)
                causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None]
                causal_mask = causal_mask.to(attention_mask.dtype)
                extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
            else:
                extended_attention_mask = attention_mask[:, None, None, :]
        else:
            raise ValueError(
                "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format(
                    input_shape, attention_mask.shape
                )
            )

        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
        # masked positions, this operation creates a tensor which is 0.0 for
        # positions we want to attend and -10000.0 for masked positions. Because
        # it is added to the raw scores before the softmax, this is effectively
        # the same as removing these entirely.
        extended_attention_mask = extended_attention_mask.to(dtype=self.dtype)  # fp16 compatibility
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
        return extended_attention_mask

    def get_head_mask(self, head_mask, num_hidden_layers, is_attention_chunked=False):
        """
        # Prepare head mask if needed
        # 1.0 in head_mask indicates we keep the head
        attention_probs has shape bsz x n_heads x N x N
        Arguments:
            head_mask: torch.Tensor or None: has shape [num_heads] or [num_hidden_layers x num_heads]
            num_hidden_layers: int
        Returns:
             Tensor of shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
             or list with [None] for each layer
        NT)_convert_head_mask_to_5d	unsqueeze)r   	head_masknum_hidden_layersZis_attention_chunkedr   r   r   get_head_mask   s    
zModuleUtilsMixin.get_head_maskc                 C   s   |  dkr<|dddd}||dddd}n"|  dkr^|ddd}|  dks|td|   |j| jd}|S )zD-> [num_hidden_layers x batch x num_heads x seq_length x seq_length]r
   r   rc   rQ      zhead_mask.dim != 5, instead rR   )rU   re   expandAssertionErrorrV   r   )r   rf   rg   r   r   r   rd      s    z)ModuleUtilsMixin._convert_head_mask_to_5dN)F)F)r"   r#   r$   r%   boolintr6   staticmethodrC   rG   rK   rI   propertyr   r   r   rW   tuplerb   rh   rd   r   r   r   r   r'   :   s    

	,
r'   c                       s  e Zd ZdZdZi ZdZedd Z fddZ	edd	 Z
d
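# A minimal usage sketch of the additive-mask convention implemented above.
# `ToyModel` and the toy values are illustrative assumptions, not part of this
# module's API; only `get_extended_attention_mask` is defined here.
#
#     model = ToyModel(config)                        # any nn.Module + ModuleUtilsMixin
#     mask = torch.tensor([[1, 1, 0]])                # 1 = attend, 0 = padding
#     ext = model.get_extended_attention_mask(mask, mask.shape, mask.device)
#     # ext.shape == (1, 1, 1, 3) with values [0., 0., -10000.]: added to the raw
#     # attention scores, masked positions get ~zero probability after softmax.
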
d Zdd Zdd Zdd Zdd Zd3ddZdd Zd4ddZdd Zdd Zdd Zed d! Zd"d# Zd$d% Zd&d' Zd(d) Ze d5d*d+Zd,d- Zd.d/ Z e!e"e#e"e# d0d1d2Z$  Z%S )6PreTrainedModela/   Base class for all models.

        :class:`~transformers.PreTrainedModel` takes care of storing the configuration of the models and handles methods for loading/downloading/saving models
        as well as a few methods common to all models to (i) resize the input embeddings and (ii) prune heads in the self-attention heads.

        Class attributes (overridden by derived classes):
            - ``config_class``: a class derived from :class:`~transformers.PretrainedConfig` to use as configuration class for this model architecture.
            - ``pretrained_model_archive_map``: a python ``dict`` with `short-cut-names` (string) as keys and `url` (string) of associated pretrained weights as values.
            - ``load_tf_weights``: a python ``method`` for loading a TensorFlow checkpoint in a PyTorch model, taking as arguments:

                - ``model``: an instance of the relevant subclass of :class:`~transformers.PreTrainedModel`,
                - ``config``: an instance of the relevant subclass of :class:`~transformers.PretrainedConfig`,
                - ``path``: a path (string) to the TensorFlow checkpoint.

            - ``base_model_prefix``: a string indicating the attribute associated to the base model in derived classes of the same architecture adding modules on top of the base model.
    """
    config_class = None
    pretrained_model_archive_map = {}
    base_model_prefix = ""

    @property
    def dummy_inputs(self):
        """ Dummy inputs to do a forward pass in the network.

        Returns:
            torch.Tensor with dummy inputs
        """
        return {"input_ids": torch.tensor(DUMMY_INPUTS)}

    def __init__(self, config, *inputs, **kwargs):
        super().__init__()
        if not isinstance(config, PretrainedConfig):
            raise ValueError(
                "Parameter config in `{}(config)` should be an instance of class `PretrainedConfig`. "
                "To create a model from a pretrained model use "
                "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
                    self.__class__.__name__, self.__class__.__name__
                )
            )
        # Save config in model
        self.config = config

    @property
    def base_model(self):
        return getattr(self, self.base_model_prefix, self)

    def get_input_embeddings(self):
        """
        Returns the model's input embeddings.

        Returns:
            :obj:`nn.Module`:
                A torch module mapping vocabulary to hidden states.
        N)rw   rx   get_input_embeddingsNotImplementedError)r   ry   r   r   r   rz     s    z$PreTrainedModel.get_input_embeddingsc                 C   s*   t | | j| }|| k	r"|| ntdS )z
        Set model's input embeddings

        Args:
            value (:obj:`nn.Module`):
                A module mapping vocabulary to hidden states.
        """
        base_model = getattr(self, self.base_model_prefix, self)
        if base_model is not self:
            base_model.set_input_embeddings(value)
        else:
            raise NotImplementedError

    def get_output_embeddings(self):
        """
        Returns the model's output embeddings.

        Returns:
            :obj:`nn.Module`:
                A torch module mapping hidden states to vocabulary.
        """
        return None  # Overwrite for models with output embeddings

    def tie_weights(self):
        """
        Tie the weights between the input embeddings and the output embeddings.
        If the `torchscript` flag is set in the configuration, can't handle parameter sharing so we are cloning
        the weights instead.
        N)r~   _tie_or_clone_weightsrz   )r   output_embeddingsr   r   r   tie_weights(  s    zPreTrainedModel.tie_weightsc                 C   s   | j jrt|j |_n|j|_t|dddk	rhtjj	|j
jd|jjd |j
jd  fdd|j
_t|drt|dr|j|_dS )zZ Tie or clone module weights depending of whether we are using TorchScript or not
        biasNr   Zconstantout_featuresnum_embeddings)rZ   Ztorchscriptr   	Parameterweightclonerw   r[   r	   padr   datar`   rF   r   r   )r   r   Zinput_embeddingsr   r   r   r   2  s    z%PreTrainedModel._tie_or_clone_weightsc                 C   s>   t | | j| }||}|dkr$|S || j_||_|   |S )a   Resize input token embeddings matrix of the model if new_num_tokens != config.vocab_size.
        Take care of tying weights embeddings afterwards if the model class has a `tie_weights()` method.

        Arguments:

            new_num_tokens: (`optional`) int:
                New number of tokens in the embedding matrix. Increasing the size will add newly initialized vectors at the end. Reducing the size will remove vectors from the end.
                If not provided or None: does nothing and just returns a pointer to the input tokens ``torch.nn.Embeddings`` Module of the model.

        Return: ``torch.nn.Embeddings``
            Pointer to the input tokens Embeddings Module of the model
        N)rw   rx   _resize_token_embeddingsrZ   
vocab_sizer   )r   new_num_tokensry   Zmodel_embedsr   r   r   resize_token_embeddingsD  s    
z'PreTrainedModel.resize_token_embeddingsc                 C   s&   |   }| ||}| | |   S r   )rz   _get_resized_embeddingsr|   )r   r   old_embeddingsnew_embeddingsr   r   r   r   _  s    
z(PreTrainedModel._resize_token_embeddingsc                 C   s   |dkr|S |j  \}}||kr&|S t||}||j j | | t||}|j jd|ddf |j jd|ddf< |S )a   Build a resized Embedding Module from a provided token Embedding Module.
            Increasing the size will add newly initialized vectors at the end
            Reducing the size will remove vectors from the end

        Args:
            new_num_tokens: (`optional`) int
                New number of tokens in the embedding matrix.
                Increasing the size will add newly initialized vectors at the end
                Reducing the size will remove vectors from the end
                If not provided or None: return the provided token Embedding Module.
        Return: ``torch.nn.Embeddings``
            Pointer to the resized Embedding Module or the old Embedding Module if new_num_tokens is None
        N)	r   sizer   Z	EmbeddingrV   r   _init_weightsminr   )r   r   r   Zold_num_tokensZold_embedding_dimr   Znum_tokens_to_copyr   r   r   r   e  s    

,z'PreTrainedModel._get_resized_embeddingsc                 C   s.   |  | j | jjr"| | jj |   dS )z* Initialize and prunes weights if needed. N)applyr   rZ   pruned_headsprune_headsr   rN   r   r   r   init_weights  s    zPreTrainedModel.init_weightsc                 C   sN   |  D ]4\}}t| jj|g t|B }t|| jj|< q| j| dS )aZ   Prunes heads of the base model.

            Arguments:

                heads_to_prune: dict with keys being selected layer indices (`int`) and associated values being the list of heads to prune in said layer (list of `int`).
                E.g. {1: [0, 2], 2: [2, 3]} will prune heads 0 and 2 on layer 1 and heads 2 and 3 on layer 2.
        N)itemssetrZ   r   getlistry   Z_prune_heads)r   Zheads_to_prunelayerZheadsZunion_headsr   r   r   r     s    	zPreTrainedModel.prune_headsc                 C   s   t j|stdt| dr$| jn| }|jjg|j_	t j
|t}t| jddrddlm  m} | rx|j| || | n|j| t| | td| dS )a   Save a model and its configuration file to a directory, so that it
            can be re-loaded using the `:func:`~transformers.PreTrainedModel.from_pretrained`` class method.

            Arguments:
                save_directory: directory to which to save.
        """
        assert os.path.isdir(
            save_directory
        ), "Saving path should be a directory where the model and configuration can be saved"

        # Only save the model itself if we are using distributed training
        model_to_save = self.module if hasattr(self, "module") else self

        # Attach architecture to the config
        model_to_save.config.architectures = [model_to_save.__class__.__name__]

        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(save_directory, WEIGHTS_NAME)

        if getattr(self.config, "xla_device", False):
            import torch_xla.core.xla_model as xm

            if xm.is_master_ordinal():
                # Save configuration file
                model_to_save.config.save_pretrained(save_directory)
            # xm.save takes care of saving only from master
            xm.save(model_to_save.state_dict(), output_model_file)
        else:
            model_to_save.config.save_pretrained(save_directory)
            torch.save(model_to_save.state_dict(), output_model_file)

        logger.info("Model weights saved in {}".format(output_model_file))

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        r"""Instantiate a pretrained pytorch model from a pre-trained model configuration.

        The model is set in evaluation mode by default using ``model.eval()`` (Dropout modules are deactivated)
        To train the model, you should first set it back in training mode with ``model.train()``

        The warning ``Weights from XXX not initialized from pretrained model`` means that the weights of XXX do not come pre-trained with the rest of the model.
        It is up to you to train those weights with a downstream fine-tuning task.

        The warning ``Weights from XXX not used in YYY`` means that the layer XXX is not used by YYY, therefore those weights are discarded.

        Parameters:
            pretrained_model_name_or_path: either:
              - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
              - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
              - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
              - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
              - None if you are both providing the configuration and state dictionary (resp. with keyword arguments ``config`` and ``state_dict``)

            model_args: (`optional`) Sequence of positional arguments:
                All remaining positional arguments will be passed to the underlying model's ``__init__`` method

            config: (`optional`) one of:
                - an instance of a class derived from :class:`~transformers.PretrainedConfig`, or
                - a string valid as input to :func:`~transformers.PretrainedConfig.from_pretrained()`
                Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:
                    - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
                    - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by supplying the save directory.
                    - the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.

            state_dict: (`optional`) dict:
                an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file.
                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.

            cache_dir: (`optional`) string:
                Path to a directory in which a downloaded pre-trained model
                configuration should be cached if the standard cache should not be used.

            force_download: (`optional`) boolean, default False:
                Force to (re-)download the model weights and configuration files and override the cached versions if they exist.

            resume_download: (`optional`) boolean, default False:
                Do not delete incompletely received file. Attempt to resume the download if such a file exists.

            proxies: (`optional`) dict, default None:
                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
                The proxies are used on each request.

            output_loading_info: (`optional`) boolean:
                Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages.

            kwargs: (`optional`) Remaining dictionary of keyword arguments:
                Can be used to update the configuration object (after it has been loaded) and initiate the model (e.g. ``output_attention=True``). These kwargs behave differently depending on whether a `config` is provided or automatically loaded:

                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.

        Examples::

            # For example purposes. Not runnable.
            model = BertModel.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
            model = BertModel.from_pretrained('./test/saved_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
            model = BertModel.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
            assert model.config.output_attention == True
            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
            config = BertConfig.from_json_file('./tf_model/my_tf_model_config.json')
            model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_tf=True, config=config)

        """
        config = kwargs.pop("config", None)
        state_dict = kwargs.pop("state_dict", None)
        cache_dir = kwargs.pop("cache_dir", None)
        from_tf = kwargs.pop("from_tf", False)
        force_download = kwargs.pop("force_download", False)
        resume_download = kwargs.pop("resume_download", False)
        proxies = kwargs.pop("proxies", None)
        output_loading_info = kwargs.pop("output_loading_info", False)
        local_files_only = kwargs.pop("local_files_only", False)
        use_cdn = kwargs.pop("use_cdn", True)

        # Load config if we don't provide a configuration
        if not isinstance(config, PretrainedConfig):
            config_path = config if config is not None else pretrained_model_name_or_path
            config, model_kwargs = cls.config_class.from_pretrained(
                config_path,
                *model_args,
                cache_dir=cache_dir,
                return_unused_kwargs=True,
                force_download=force_download,
                resume_download=resume_download,
                proxies=proxies,
                local_files_only=local_files_only,
                **kwargs,
            )
        else:
            model_kwargs = kwargs

        # Load model
        if pretrained_model_name_or_path is not None:
            if pretrained_model_name_or_path in cls.pretrained_model_archive_map:
                archive_file = cls.pretrained_model_archive_map[pretrained_model_name_or_path]
            elif os.path.isdir(pretrained_model_name_or_path):
                if from_tf and os.path.isfile(os.path.join(pretrained_model_name_or_path, TF_WEIGHTS_NAME + ".index")):
                    # Load from a TF 1.0 checkpoint
                    archive_file = os.path.join(pretrained_model_name_or_path, TF_WEIGHTS_NAME + ".index")
                elif from_tf and os.path.isfile(os.path.join(pretrained_model_name_or_path, TF2_WEIGHTS_NAME)):
                    # Load from a TF 2.0 checkpoint
                    archive_file = os.path.join(pretrained_model_name_or_path, TF2_WEIGHTS_NAME)
                elif os.path.isfile(os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)):
                    # Load from a PyTorch checkpoint
                    archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)
                else:
                    raise EnvironmentError(
                        "Error no file named {} found in directory {} or `from_tf` set to False".format(
                            [WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME + ".index"],
                            pretrained_model_name_or_path,
                        )
                    )
            elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
                archive_file = pretrained_model_name_or_path
            elif os.path.isfile(pretrained_model_name_or_path + ".index"):
                assert (
                    from_tf
                ), "We found a TensorFlow checkpoint at {}, please set from_tf to True to load from this checkpoint".format(
                    pretrained_model_name_or_path + ".index"
                )
                archive_file = pretrained_model_name_or_path + ".index"
            else:
                archive_file = hf_bucket_url(
                    pretrained_model_name_or_path,
                    filename=(TF2_WEIGHTS_NAME if from_tf else WEIGHTS_NAME),
                    use_cdn=use_cdn,
                )

            try:
                # Load from URL or cache if already cached
                resolved_archive_file = cached_path(
                    archive_file,
                    cache_dir=cache_dir,
                    force_download=force_download,
                    proxies=proxies,
                    resume_download=resume_download,
                    local_files_only=local_files_only,
                )
            except EnvironmentError:
                if pretrained_model_name_or_path in cls.pretrained_model_archive_map:
                    msg = "Couldn't reach server at '{}' to download pretrained weights.".format(archive_file)
                else:
                    msg = (
                        "Model name '{}' was not found in model name list ({}). "
                        "We assumed '{}' was a path or url to model weight files named one of {} but "
                        "couldn't find any such file at this path or url.".format(
                            pretrained_model_name_or_path,
                            ", ".join(cls.pretrained_model_archive_map.keys()),
                            archive_file,
                            [WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME],
                        )
                    )
                raise EnvironmentError(msg)

            if resolved_archive_file == archive_file:
                logger.info("loading weights file {}".format(archive_file))
            else:
                logger.info("loading weights file {} from cache at {}".format(archive_file, resolved_archive_file))
        else:
            resolved_archive_file = None

        # Instantiate model.
        model = cls(config, *model_args, **model_kwargs)

        if state_dict is None and not from_tf:
            try:
                state_dict = torch.load(resolved_archive_file, map_location="cpu")
            except Exception:
                raise OSError(
                    "Unable to load weights from pytorch checkpoint file. "
                    "If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True. "
                )

        missing_keys = []
        unexpected_keys = []
        error_msgs = []

        if from_tf:
            if resolved_archive_file.endswith(".index"):
                # Load from a TensorFlow 1.X checkpoint - provided by original authors
                model = cls.load_tf_weights(model, config, resolved_archive_file[:-6])  # Remove the '.index'
            else:
                # Load from our TensorFlow 2.0 checkpoints
                try:
                    from transformers import load_tf2_checkpoint_in_pytorch_model

                    model = load_tf2_checkpoint_in_pytorch_model(model, resolved_archive_file, allow_missing_keys=True)
                except ImportError:
                    logger.error(
                        "Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. "
                        "Please see https://pytorch.org/ and https://www.tensorflow.org/install/ for installation "
                        "instructions."
                    )
                    raise
        else:
            # Convert old format to new format if needed from a PyTorch state_dict
            old_keys = []
            new_keys = []
            for key in state_dict.keys():
                new_key = None
                if "gamma" in key:
                    new_key = key.replace("gamma", "weight")
                if "beta" in key:
                    new_key = key.replace("beta", "bias")
                if new_key:
                    old_keys.append(key)
                    new_keys.append(new_key)
            for old_key, new_key in zip(old_keys, new_keys):
                state_dict[new_key] = state_dict.pop(old_key)

            # copy state_dict so _load_from_state_dict can modify it
            metadata = getattr(state_dict, "_metadata", None)
            state_dict = state_dict.copy()
            if metadata is not None:
                state_dict._metadata = metadata

            # PyTorch's `_load_from_state_dict` does not copy parameters in a module's
            # descendants, so we need to apply the function recursively.
            def load(module: nn.Module, prefix=""):
                local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
                module._load_from_state_dict(
                    state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs,
                )
                for name, child in module._modules.items():
                    if child is not None:
                        load(child, prefix + name + ".")

            # Make sure we are able to load base models as well as derived models (with heads)
            start_prefix = ""
            model_to_load = model
            has_prefix_module = any(s.startswith(cls.base_model_prefix) for s in state_dict.keys())
            if not hasattr(model, cls.base_model_prefix) and has_prefix_module:
                start_prefix = cls.base_model_prefix + "."
            if hasattr(model, cls.base_model_prefix) and not has_prefix_module:
                model_to_load = getattr(model, cls.base_model_prefix)

            load(model_to_load, prefix=start_prefix)

            if model.__class__.__name__ != model_to_load.__class__.__name__:
                base_model_state_dict = model_to_load.state_dict().keys()
                head_model_state_dict_without_base_prefix = [
                    key.split(cls.base_model_prefix + ".")[-1] for key in model.state_dict().keys()
                ]
                missing_keys.extend(set(head_model_state_dict_without_base_prefix) - set(base_model_state_dict))

            if len(missing_keys) > 0:
                logger.info(
                    "Weights of {} not initialized from pretrained model: {}".format(
                        model.__class__.__name__, missing_keys
                    )
                )
            if len(unexpected_keys) > 0:
                logger.info(
                    "Weights from pretrained model not used in {}: {}".format(
                        model.__class__.__name__, unexpected_keys
                    )
                )
            if len(error_msgs) > 0:
                raise RuntimeError(
                    "Error(s) in loading state_dict for {}:\n\t{}".format(
                        model.__class__.__name__, "\n\t".join(error_msgs)
                    )
                )
        model.tie_weights()  # make sure token embedding weights are still tied if needed

        # Set model in evaluation mode to deactivate DropOut modules by default
        model.eval()

        if output_loading_info:
            loading_info = {
                "missing_keys": missing_keys,
                "unexpected_keys": unexpected_keys,
                "error_msgs": error_msgs,
            }
            return model, loading_info

        if hasattr(config, "xla_device") and config.xla_device:
            import torch_xla.core.xla_model as xm

            model = xm.send_cpu_data_to_device(model, xm.xla_device())
            model = model.to(xm.xla_device())

        return model

    def prepare_inputs_for_generation(self, input_ids, **kwargs):
        return {"input_ids": input_ids}

    def prepare_logits_for_generation(self, logits, **kwargs):
        return logits

    def _use_cache(self, outputs, use_cache):
        """During generation, decide whether to pass the `past` variable to the next forward pass."""
   Fmem_lenr   T)r   rF   rZ   r   )r   outputs	use_cacher   r   r   
_use_cache  s
    zPreTrainedModel._use_cachec                 C   sf   t || D ]T}t||  D ]>}|||f dk rJ|||f  |9  < q |||f  |  < q qdS )zGrepetition penalty (from CTRL paper https://arxiv.org/abs/1909.05858). r   N)ranger   tolist)r   Zlprobsra   	num_beamsZprev_output_tokensrepetition_penaltyiZprevious_tokenr   r   r   enforce_repetition_penalty_  s
    @torch.no_grad()
    def generate(
        self,
        input_ids=None,
        max_length=None,
        min_length=None,
        do_sample=None,
        early_stopping=None,
        num_beams=None,
        temperature=None,
        top_k=None,
        top_p=None,
        repetition_penalty=None,
        bad_words_ids=None,
        bos_token_id=None,
        pad_token_id=None,
        eos_token_id=None,
        length_penalty=None,
        no_repeat_ngram_size=None,
        num_return_sequences=None,
        attention_mask=None,
        decoder_start_token_id=None,
        use_cache=None,
        **model_specific_kwargs,
    ):
        r""" Generates sequences for models with a LM head. The method currently supports greedy decoding, beam-search decoding, sampling with temperature, sampling with top-k or nucleus sampling.

        Adapted in part from `Facebook's XLM beam search code`_.

        .. _`Facebook's XLM beam search code`:
           https://github.com/facebookresearch/XLM/blob/9e6f6814d17be4fe5b15f2e6c43eb2b2d76daeb4/src/model/transformer.py#L529


        Parameters:

            input_ids: (`optional`) `torch.LongTensor` of shape `(batch_size, sequence_length)`
                The sequence used as a prompt for the generation. If `None` the method initializes
                it as an empty `torch.LongTensor` of shape `(1,)`.

            max_length: (`optional`) int
                The max length of the sequence to be generated.  Between `min_length` and infinity. Default to 20.

            min_length: (`optional`) int
                The min length of the sequence to be generated.  Between 0 and infinity. Default to 0.

            do_sample: (`optional`) bool
                If set to `False` greedy decoding is used. Otherwise sampling is used. Defaults to `False` as defined in `configuration_utils.PretrainedConfig`.

            early_stopping: (`optional`) bool
                if set to `True` beam search is stopped when at least `num_beams` sentences finished per batch. Defaults to `False` as defined in `configuration_utils.PretrainedConfig`.

            num_beams: (`optional`) int
                Number of beams for beam search. Must be between 1 and infinity. 1 means no beam search. Default to 1.

            temperature: (`optional`) float
                The value used to module the next token probabilities. Must be strictly positive. Default to 1.0.

            top_k: (`optional`) int
                The number of highest probability vocabulary tokens to keep for top-k-filtering. Between 1 and infinity. Default to 50.

            top_p: (`optional`) float
                The cumulative probability of parameter highest probability vocabulary tokens to keep for nucleus sampling. Must be between 0 and 1. Default to 1.

            repetition_penalty: (`optional`) float
                The parameter for repetition penalty. Between 1.0 and infinity. 1.0 means no penalty. Default to 1.0.

            pad_token_id: (`optional`) int
                Padding token. Defaults to the model-specific pad_token_id or None if it does not exist.

            bos_token_id: (`optional`) int
                BOS token. Defaults to `bos_token_id` as defined in the models config.

            eos_token_id: (`optional`) int
                EOS token. Defaults to `eos_token_id` as defined in the models config.

            length_penalty: (`optional`) float
                Exponential penalty to the length. Default to 1.

            no_repeat_ngram_size: (`optional`) int
                If set to int > 0, all ngrams of size `no_repeat_ngram_size` can only occur once.
            bad_words_ids: (`optional`) list of lists of int
                `bad_words_ids` contains tokens that are not allowed to be generated. In order to get the tokens of the words that should not appear in the generated text, use `tokenizer.encode(bad_word, add_prefix_space=True)`.

            num_return_sequences: (`optional`) int
                The number of independently computed returned sequences for each element in the batch. Default to 1.

            attention_mask (`optional`) obj: `torch.LongTensor` of same shape as `input_ids`
                Mask to avoid performing attention on padding token indices.
                Mask values selected in ``[0, 1]``:
                ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
                Defaults to `None`.

                `What are attention masks? <../glossary.html#attention-mask>`__

            decoder_start_token_id=None: (`optional`) int
                If an encoder-decoder model starts decoding with a different token than BOS.
                Defaults to `None` and is changed to `BOS` later.

            use_cache: (`optional`) bool
                If `use_cache` is True, past key values are used to speed up decoding if applicable to model. Defaults to `True`.

            model_specific_kwargs: (`optional`) dict
                Additional model specific kwargs will be forwarded to the `forward` function of the model.

        Return:

            output: `torch.LongTensor` of shape `(batch_size * num_return_sequences, sequence_length)`
                sequence_length is either equal to max_length or shorter if all batches finished early due to the `eos_token_id`

        Examples::

            tokenizer = AutoTokenizer.from_pretrained('distilgpt2')   # Initialize tokenizer
            model = AutoModelWithLMHead.from_pretrained('distilgpt2')    # Download model and configuration from S3 and cache.
            outputs = model.generate(max_length=40)  # do greedy decoding
            print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True)))

            tokenizer = AutoTokenizer.from_pretrained('openai-gpt')   # Initialize tokenizer
            model = AutoModelWithLMHead.from_pretrained('openai-gpt')    # Download model and configuration from S3 and cache.
            input_context = 'The dog'
            input_ids = tokenizer.encode(input_context, return_tensors='pt')  # encode input context
            outputs = model.generate(input_ids=input_ids, num_beams=5, num_return_sequences=3, temperature=1.5)  # generate 3 independent sequences using beam search decoding (5 beams) with sampling from initial context 'The dog'
            for i in range(3): #  3 output sequences were generated
                print('Generated {}: {}'.format(i, tokenizer.decode(outputs[i], skip_special_tokens=True)))

            tokenizer = AutoTokenizer.from_pretrained('distilgpt2')   # Initialize tokenizer
            model = AutoModelWithLMHead.from_pretrained('distilgpt2')    # Download model and configuration from S3 and cache.
            input_context = 'The dog'
            input_ids = tokenizer.encode(input_context, return_tensors='pt')  # encode input context
            outputs = model.generate(input_ids=input_ids, max_length=40, temperature=0.7, num_return_sequences=3)  # generate 3 independent sequences by sampling
            for i in range(3): #  3 output sequences were generated
                print('Generated {}: {}'.format(i, tokenizer.decode(outputs[i], skip_special_tokens=True)))

            tokenizer = AutoTokenizer.from_pretrained('ctrl')   # Initialize tokenizer
            model = AutoModelWithLMHead.from_pretrained('ctrl')    # Download model and configuration from S3 and cache.
            input_context = 'Legal My neighbor is'  # "Legal" is one of the control codes for ctrl
            input_ids = tokenizer.encode(input_context, return_tensors='pt')  # encode input context
            outputs = model.generate(input_ids=input_ids, max_length=50, temperature=0.7, repetition_penalty=1.2)  # generate sequences
            print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True)))

            tokenizer = AutoTokenizer.from_pretrained('gpt2')   # Initialize tokenizer
            model = AutoModelWithLMHead.from_pretrained('gpt2')    # Download model and configuration from S3 and cache.
            input_context = 'My cute dog'
            bad_words_ids = [tokenizer.encode(bad_word, add_prefix_space=True) for bad_word in ['idiot', 'stupid', 'shut up']]
            input_ids = tokenizer.encode(input_context, return_tensors='pt')  # encode input context
            outputs = model.generate(input_ids=input_ids, max_length=100, do_sample=True, bad_words_ids=bad_words_ids)  # generate sequences without allowing bad_words to be generated
        """

        # We cannot generate if the model does not have a LM head
        if self.get_output_embeddings() is None:
            raise AttributeError(
                "You tried to generate sequences with a model that does not have a LM Head."
                "Please use another model class (e.g. `OpenAIGPTLMHeadModel`, `XLNetLMHeadModel`, `GPT2LMHeadModel`, "
                "`CTRLLMHeadModel`, `T5WithLMHeadModel`, `TransfoXLLMHeadModel`, `XLMWithLMHeadModel`, "
                "`BartForConditionalGeneration` )"
            )

        max_length = max_length if max_length is not None else self.config.max_length
        min_length = min_length if min_length is not None else self.config.min_length
        do_sample = do_sample if do_sample is not None else self.config.do_sample
        early_stopping = early_stopping if early_stopping is not None else self.config.early_stopping
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        num_beams = num_beams if num_beams is not None else self.config.num_beams
        temperature = temperature if temperature is not None else self.config.temperature
        top_k = top_k if top_k is not None else self.config.top_k
        top_p = top_p if top_p is not None else self.config.top_p
        repetition_penalty = repetition_penalty if repetition_penalty is not None else self.config.repetition_penalty
        bos_token_id = bos_token_id if bos_token_id is not None else self.config.bos_token_id
        pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id
        eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id
        length_penalty = length_penalty if length_penalty is not None else self.config.length_penalty
        no_repeat_ngram_size = (
            no_repeat_ngram_size if no_repeat_ngram_size is not None else self.config.no_repeat_ngram_size
        )
        bad_words_ids = bad_words_ids if bad_words_ids is not None else self.config.bad_words_ids
        num_return_sequences = (
            num_return_sequences if num_return_sequences is not None else self.config.num_return_sequences
        )
        decoder_start_token_id = (
            decoder_start_token_id if decoder_start_token_id is not None else self.config.decoder_start_token_id
        )

        if input_ids is not None:
            batch_size = input_ids.shape[0]  # overridden by the input batch_size
        else:
            batch_size = 1

        assert isinstance(max_length, int) and max_length > 0, "`max_length` should be a strictly positive integer."
        assert isinstance(min_length, int) and min_length >= 0, "`min_length` should be a positive integer."
        assert isinstance(do_sample, bool), "`do_sample` should be a boolean."
        assert isinstance(early_stopping, bool), "`early_stopping` should be a boolean."
        assert isinstance(use_cache, bool), "`use_cache` should be a boolean."
        assert isinstance(num_beams, int) and num_beams > 0, "`num_beams` should be a strictly positive integer."
        assert temperature > 0, "`temperature` should be strictly positive."
        assert isinstance(top_k, int) and top_k >= 0, "`top_k` should be a positive integer."
        assert 0 <= top_p <= 1, "`top_p` should be between 0 and 1."
        assert repetition_penalty >= 1.0, "`repetition_penalty` should be >= 1."
        assert input_ids is not None or (
            isinstance(bos_token_id, int) and bos_token_id >= 0
        ), "If input_ids is not defined, `bos_token_id` should be a positive integer."
        assert pad_token_id is None or (
            isinstance(pad_token_id, int) and pad_token_id >= 0
        ), "`pad_token_id` should be a positive integer."
        assert eos_token_id is None or (
            isinstance(eos_token_id, int) and eos_token_id >= 0
        ), "`eos_token_id` should be a positive integer."
        assert length_penalty > 0, "`length_penalty` should be strictly positive."
        assert (
            isinstance(no_repeat_ngram_size, int) and no_repeat_ngram_size >= 0
        ), "`no_repeat_ngram_size` should be a positive integer."
        assert (
            isinstance(num_return_sequences, int) and num_return_sequences > 0
        ), "`num_return_sequences` should be a strictly positive integer."
        assert (
            bad_words_ids is None or isinstance(bad_words_ids, list) and isinstance(bad_words_ids[0], list)
        ), "`bad_words_ids` is either `None` or a list of lists of tokens that should not be generated"

        if input_ids is None:
            assert isinstance(bos_token_id, int) and bos_token_id >= 0, (
                "you should either supply a context to complete as `input_ids` input "
                "or a `bos_token_id` (integer >= 0) as a first token to start the generation."
            )
            input_ids = torch.full(
                (batch_size, 1), bos_token_id, dtype=torch.long, device=next(self.parameters()).device,
            )
        else:
            assert input_ids.dim() == 2, "Input prompt should be of shape (batch_size, sequence length)."

        # do not allow duplicate outputs when greedy decoding
        if do_sample is False:
            if num_beams == 1:
                # no_beam_search greedy generation conditions
                assert num_return_sequences == 1, (
                    "Greedy decoding will always produce the same output for num_beams == 1 "
                    "and num_return_sequences > 1. Please set num_return_sequences = 1"
                )
            else:
                # beam_search greedy generation conditions
                assert num_beams >= num_return_sequences, (
                    "Greedy beam search decoding cannot return more sequences than it has beams. "
                    "Please set num_beams >= num_return_sequences"
                )

        # create attention mask if necessary
        if (attention_mask is None) and (pad_token_id is not None) and (pad_token_id in input_ids):
            attention_mask = input_ids.ne(pad_token_id).long()
        elif attention_mask is None:
            attention_mask = input_ids.new_ones(input_ids.shape)

        # set pad_token_id to eos_token_id if not set
        if pad_token_id is None and eos_token_id is not None:
            logger.warning(
                "Setting `pad_token_id` to {} (first `eos_token_id`) to generate sequence".format(eos_token_id)
            )
            pad_token_id = eos_token_id

        # current position and vocab size
        if hasattr(self.config, "vocab_size"):
            vocab_size = self.config.vocab_size
        elif (
            self.config.is_encoder_decoder
            and hasattr(self.config, "decoder")
            and hasattr(self.config.decoder, "vocab_size")
        ):
            vocab_size = self.config.decoder.vocab_size

        # set effective batch size and effective batch multiplier according to do_sample
        if do_sample:
            effective_batch_size = batch_size * num_return_sequences
            effective_batch_mult = num_return_sequences
        else:
            effective_batch_size = batch_size
            effective_batch_mult = 1

        if self.config.is_encoder_decoder:
            if decoder_start_token_id is None:
                decoder_start_token_id = bos_token_id

            assert (
                decoder_start_token_id is not None
            ), "decoder_start_token_id or bos_token_id has to be defined for encoder-decoder generation"
            assert hasattr(self, "get_encoder"), "{} should have a 'get_encoder' function defined".format(self)
            assert callable(self.get_encoder), "{} should be a method".format(self.get_encoder)

            # get encoder and store encoder outputs
            encoder = self.get_encoder()
            encoder_outputs = encoder(input_ids, attention_mask=attention_mask)

        # Expand input ids if num_beams > 1 or num_return_sequences > 1
        if num_return_sequences > 1 or num_beams > 1:
            input_ids_len = input_ids.shape[-1]
            input_ids = input_ids.unsqueeze(1).expand(batch_size, effective_batch_mult * num_beams, input_ids_len)
            attention_mask = attention_mask.unsqueeze(1).expand(
                batch_size, effective_batch_mult * num_beams, input_ids_len
            )
            input_ids = input_ids.contiguous().view(
                effective_batch_size * num_beams, input_ids_len
            )  # shape: (batch_size * num_return_sequences * num_beams, cur_len)
            attention_mask = attention_mask.contiguous().view(effective_batch_size * num_beams, input_ids_len)

        if self.config.is_encoder_decoder:
            # create empty decoder_input_ids
            input_ids = torch.full(
                (effective_batch_size * num_beams, 1),
                decoder_start_token_id,
                dtype=torch.long,
                device=next(self.parameters()).device,
            )
            cur_len = 1

            assert (
                batch_size == encoder_outputs[0].shape[0]
            ), f"expected encoder_outputs[0] to have 1st dimension bs={batch_size}, got {encoder_outputs[0].shape[0]} "

            # expand batch_idx to assign the correct encoder output for expanded input_ids
            expanded_batch_idxs = (
                torch.arange(batch_size)
                .view(-1, 1)
                .repeat(1, num_beams * effective_batch_mult)
                .view(-1)
                .to(input_ids.device)
            )
            # expand encoder_outputs
            encoder_outputs = (encoder_outputs[0].index_select(0, expanded_batch_idxs), *encoder_outputs[1:])
        else:
            encoder_outputs = None
            cur_len = input_ids.shape[-1]

        if num_beams > 1:
            output = self._generate_beam_search(
                input_ids,
                cur_len=cur_len,
                max_length=max_length,
                min_length=min_length,
                do_sample=do_sample,
                early_stopping=early_stopping,
                temperature=temperature,
                top_k=top_k,
                top_p=top_p,
                repetition_penalty=repetition_penalty,
                no_repeat_ngram_size=no_repeat_ngram_size,
                bad_words_ids=bad_words_ids,
                bos_token_id=bos_token_id,
                pad_token_id=pad_token_id,
                decoder_start_token_id=decoder_start_token_id,
                eos_token_id=eos_token_id,
                batch_size=effective_batch_size,
                num_return_sequences=num_return_sequences,
                length_penalty=length_penalty,
                num_beams=num_beams,
                vocab_size=vocab_size,
                encoder_outputs=encoder_outputs,
                attention_mask=attention_mask,
                use_cache=use_cache,
                model_specific_kwargs=model_specific_kwargs,
            )
        else:
            output = self._generate_no_beam_search(
                input_ids,
                cur_len=cur_len,
                max_length=max_length,
                min_length=min_length,
                do_sample=do_sample,
                temperature=temperature,
                top_k=top_k,
                top_p=top_p,
                repetition_penalty=repetition_penalty,
                no_repeat_ngram_size=no_repeat_ngram_size,
                bad_words_ids=bad_words_ids,
                bos_token_id=bos_token_id,
                pad_token_id=pad_token_id,
                decoder_start_token_id=decoder_start_token_id,
                eos_token_id=eos_token_id,
                batch_size=effective_batch_size,
                encoder_outputs=encoder_outputs,
                attention_mask=attention_mask,
                use_cache=use_cache,
                model_specific_kwargs=model_specific_kwargs,
            )

        return output
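    # Worked example of the batch expansion performed in `generate` above
    # (illustrative numbers, not from the library): with batch_size=2,
    # num_beams=3 and sampling with num_return_sequences=2, we get
    # effective_batch_mult=2, so input_ids of shape (2, seq_len) is expanded to
    # (2 * 2 * 3, seq_len) = (12, seq_len); every row is then decoded
    # independently before beams/returns are regrouped per original example.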
zPreTrainedModel.generatec           %      C   s  | |d}| ||}|}||k rZ| j|f|||d|}| f |}|d dddddf }| ||r|d }|	dkr| ||d||	 |
dkrt|||
|}t|D ]}td |||| f< q|dk	rt||}t|D ]}td |||| f< q|dk	r6||k r6td |dd|f< |r|dkrN|| }t	|||d}t
j|dd	}tj|dd
d}ntj|dd	}|dk	r|| |d|   }n|}tj||dgdd	}|dk	r||k} ||   }!||!|d  ||    | dkr qZ| jjdkrPtj|||jd dfgdd	}|d }q$|  |  kr|dk	std| ||  |}"n|}"t|D ]*\}#}$|$d||#  |"|#d||# f< q|"S )z Generate sequences for each example without beam search (num_beams == 1).
            All returned sequence are generated independantly.
        r
   pastrX   r   r   Nrc   rS   inf)r   r   rU   Znum_samplesFzB`Pad_token_id` has to be defined if batches have different lengths) newfill_r   r   r   calc_banned_ngram_tokensr   floatcalc_banned_bad_words_idstop_k_top_p_filteringFsoftmaxr[   multinomialsqueezeZargmaxcatre   mulr  rl   Zmasked_fill_Zmul_maxrZ   r  r  r`   r   itemrk   	enumerate)%r   rs   r   r   r   r   r   r   r   r   r   r   r   r   r   r   ra   r   rX   r   r  Zunfinished_sentssent_lengthsr  model_inputsr   next_token_logitsbanned_tokens	batch_idxprobsZ
next_tokenZtokens_to_addZeos_in_sentsZ+is_sents_unfinished_and_token_to_add_is_eosdecodedhypo_idxhypor   r   r   r    sx    
  





 
$z(PreTrainedModel._generate_no_beam_searchc           E   	      s   fddt |D }tj|ftj|jd}|dkrPd|ddddf< |d}|}d	d t |D }|k rp| j|f|||d
|}| f |}|d dddddf } | ||r|d }|
dkr| | |||
 |dkr| | } | j	j
r|dkr| j| |d} tj| dd}!dk	rR||k rRtd |!ddf< |dkr| }"t||"||}#t|#D ]\}$}%td |!|$|%f< qz|dk	rt||}%t|%D ]\}$}%td |!|$|%f< q|!j| fks td|!j| f|r|!|dddf |! }&t|&||	dd}&|& | }&tj|&dd}'tj|'d d}(t|&d|(})tj|)ddd\})}*t|(d|*}(nF|!|dddf |! })|)| })tj|)d dddd\})}(|) |(   kr|d fks
n tg }+t |D ]~},||, r|t||, ksFtddk	rZ|dk	sbtd|+d|dfg  qg }-tt|(|, |)|, D ]\}.\}/}0|/ }1|/ }2|, |1 }3dk	r|2  kr|.k}4|4rq||, !||3 " |0   n|-#|0|2|3f t|-kr q.q||, pT||, j$|)|, %   |d||,< t|-ksptd|+|- t|+|,d  kstqt&|rqpt|+| kst|'dd |+D }|'dd |+D }5|'dd |+D }6||6ddf }tj(||5)dgdd}|dk	r6| *||6}| j	j
dkrftj(||+|jd dfgdd}|d }qpt |D ]},||, rqxdk	r
t&fdd|(|, D r
t&|)|,df |||, ks
td |)dddf |, |||, t D ]6}1|, |1 }3||3   }7||3 }8||, !|8|7 qqx|rX|n|| }9|rjdn|}:|'|9};g }<t|D ]Z\}$}=t,|=j-d!d" d#}>t |:D ]4}?|:|$ |? }@|>. d }At|A|;|@< |<#|A qq|;/   |;%   kr||dk	std$t/|;%   d }B|'|9|B0|}Ct|<D ]<\}$}D|D|C|$d|;|$ f< |;|$ k r<|C|$|;|$ f< q<n:fd%d|<D stt1|<2tj34t5| 6 j}C|CS )&z? Generate sequences for each example with beam search.
        c                    s   g | ]}t  d qS ))r   )BeamHypothesesr/   _)r   r   r   r   r   r   r     s   z9PreTrainedModel._generate_beam_search.<locals>.<listcomp>r   FrT   Nr
   rc   c                 S   s   g | ]}d qS )Fr   r0  r   r   r   r   +  s     r  r   rS   )r   r   r  r  zShapes of scores: {} != {}rQ   )r   r   min_tokens_to_keepr  T)
descendingrU   )rU   largestsortedz?Batch can only be done if at least {} beams have been generatedzMgenerated beams >= num_beams -> eos_token_id and pad_token have to be defined)r   zBeam should always be fullc                 S   s   g | ]}|d  qS )r   r   r/   r,   r   r   r   r     s     c                 S   s   g | ]}|d  qS )r
   r   r6  r   r   r   r     s     c                 S   s   g | ]}|d  qS )rQ   r   r6  r   r   r   r     s     c                 3   s   | ]}|    k	V  qd S r   )r$  )r/   token_id)r   r   r   r   r1     s    z8PreTrainedModel._generate_beam_search.<locals>.<genexpr>z\If batch_idx is not done, final next scores: {} have to equal to accumulated beam_scores: {}c                 S   s   | d S rL   r   r+   r   r   r   r-     r.   z7PreTrainedModel._generate_beam_search.<locals>.<lambda>)r   z `Pad_token_id` has to be definedc                 3   s   | ]}t | kV  qd S r   r   )r/   r.  )r   r   r   r1     s     )7r   r[   zerosr  r   r  r   r   r   rZ   r  r   r  Zlog_softmaxr  r%  r  r`   rk   r_   	expand_asr  r
  r  r  gathersorttopkr   r   r   r   r$  addr   r   is_doner#  allr  r!  re   _reorder_cacher  r5  beamsr   r   r  stacktyper  rV   rM   r3   )Er   rs   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   ra   r   r   r   r   r   rX   r   r  Zgenerated_hypsZbeam_scoresr  doner'  r   r(  ZscoresZnum_batch_hypothesesZbanned_batch_tokensr   r)  Z_scoresr+  Znext_tokensZnext_scoresZnext_scores_indicesZnext_batch_beamr*  Znext_sent_beamZbeam_token_rankZbeam_token_idZbeam_token_scoreZbeam_idr7  Zeffective_beam_idZ&is_beam_token_worse_than_top_num_beamsZbeam_tokensbeam_idxZfinal_scoreZfinal_tokensZoutput_batch_sizeZ%output_num_return_sequences_per_batchr&  bestZ
hypothesesZsorted_hypsjZeffective_batch_idxZbest_hypZsent_max_lenr,  r.  r   )r   r   r   r   r   r   r   r    sj    

  
      
   

 
     .

 
 



 


 
"z%PreTrainedModel._generate_beam_search)r  rF  r)   c                    s   t  fdd| D S )Nc                 3   s   | ]}| d  V  qdS )r
   N)r  )r/   Z
layer_pastrF  r   r   r1     s     z1PreTrainedModel._reorder_cache.<locals>.<genexpr>)rp   )r  rF  r   rI  r   rA    s    zPreTrainedModel._reorder_cache)N)N)NNNNNNNNNNNNNNNNNNNN)&r"   r#   r$   r%   r   r   rx   ro   rt   r   ry   rz   r|   r~   r   r   r   r   r   r   r   r   classmethodr   r   r   r   r   r[   Zno_gradr  r  r  rn   r   r   rA  r&   r   r   r   r   rq      sv   





""
  %
def calc_banned_ngram_tokens(prev_input_ids: Tensor, num_hypos: int, no_repeat_ngram_size: int, cur_len: int) -> None:
    """Copied from fairseq for no_repeat_ngram in beam_search"""
    if cur_len + 1 < no_repeat_ngram_size:
        # return no banned tokens if we haven't generated no_repeat_ngram_size tokens yet
        return [[] for _ in range(num_hypos)]
    generated_ngrams = [{} for _ in range(num_hypos)]
    for idx in range(num_hypos):
        gen_tokens = prev_input_ids[idx].tolist()
        generated_ngram = generated_ngrams[idx]
        for ngram in zip(*[gen_tokens[i:] for i in range(no_repeat_ngram_size)]):
            prev_ngram_tuple = tuple(ngram[:-1])
            generated_ngram[prev_ngram_tuple] = generated_ngram.get(prev_ngram_tuple, []) + [ngram[-1]]

    def _get_generated_ngrams(hypo_idx):
        # Before decoding the next token, prevent decoding of ngrams that have already appeared
        start_idx = cur_len + 1 - no_repeat_ngram_size
        ngram_idx = tuple(prev_input_ids[hypo_idx, start_idx:cur_len].tolist())
        return generated_ngrams[hypo_idx].get(ngram_idx, [])

    banned_tokens = [_get_generated_ngrams(hypo_idx) for hypo_idx in range(num_hypos)]
    return banned_tokens


def calc_banned_bad_words_ids(prev_input_ids, bad_words_ids):
    banned_tokens = []

    def _tokens_match(prev_tokens, tokens):
        if len(tokens) == 0:
            # if bad word tokens is just one token always ban it
            return True
        if len(tokens) > len(prev_tokens):
            # if bad word tokens are longer than prev tokens they can't be equal
            return False
        if prev_tokens[-len(tokens):] == tokens:
            # if tokens match
            return True
        else:
            return False

    for prev_input_ids_slice in prev_input_ids:
        banned_tokens_slice = []

        for banned_token_seq in bad_words_ids:
            assert len(banned_token_seq) > 0, "Banned words token sequences {} cannot have an empty list".format(
                bad_words_ids
            )

            if _tokens_match(prev_input_ids_slice.tolist(), banned_token_seq[:-1]) is False:
                # if tokens do not match continue
                continue

            banned_tokens_slice.append(banned_token_seq[-1])

        banned_tokens.append(banned_tokens_slice)

    return banned_tokens

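# Worked example for `calc_banned_ngram_tokens` (illustrative values): with
# no_repeat_ngram_size=2 and a hypothesis [5, 7, 5], the generated bigrams are
# {(5,): [7], (7,): [5]}; the last context token is 5, so token 7 is banned for
# the next step -- emitting it would repeat the bigram (5, 7).
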
      C   s   |dkr>t t||| d}| t| |d d k }|| |< |dk rtj| dd\}}tjtj|dddd}||k}	|dkrd|	d	d
|f< |	d	d
df 	 |	d	dd
f< d|	d< |	
d||	}|| |< | S )a   Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
        Args:
            logits: logits distribution shape (batch size, vocabulary size)
            if top_k > 0: keep only top k tokens with highest probability (top-k filtering).
            if top_p < 1.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
                Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
            Make sure we keep at least min_tokens_to_keep per batch example in the output
        From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
    r   rc   ).rc   NrS   T)r3  r  r
   .N).r   )r   r#  r   r[   r=  r<  cumsumr  r  r   Zscatter)
r   r   r   Zfilter_valuer2  Zindices_to_removeZsorted_logitsZsorted_indicesZcumulative_probsZsorted_indices_to_remover   r   r   r  M  s    
 r  c                   @   s.   e Zd Zdd Zdd Zdd Zd
dd	ZdS )r/  c                 C   s,   |d | _ || _|| _|| _g | _d| _dS )z7
        Initialize n-best list of hypotheses.
        r
   g    eAN)r   r   r   r   rB  worst_score)r   r   r   r   r   r   r   r   r   q  s    
zBeamHypotheses.__init__c                 C   s
   t | jS )z3
        Number of hypotheses in the list.
        )r   rB  rN   r   r   r   __len__|  s    zBeamHypotheses.__len__c                 C   s   |t || j  }t | | jk s*|| jkr| j||f t | | jkrtdd t| jD }| j|d d = |d d | _nt|| j| _dS )z3
        Add a new hypothesis to the list.
        c                 S   s   g | ]\}\}}||fqS r   r   )r/   rQ  r   r1  r   r   r   r     s    
 z&BeamHypotheses.add.<locals>.<listcomp>r   r
   N)	r   r   r   rV  rB  r   r5  r%  r   )r   ZhypZsum_logprobsZscoreZsorted_scoresr   r   r   r>    s    zBeamHypotheses.addNc                 C   sJ   t | | jk rdS | jrdS |dkr*| j}||| j  }| j|k}|S dS )z
        If there are enough hypotheses and that none of the hypotheses being generated
        can become better than the worst one in the heap, then we are done with this sentence.
        FTN)r   r   r   r   r   rV  )r   Zbest_sum_logprobsr   Z	cur_scoreretr   r   r   r?    s    
zBeamHypotheses.is_done)N)r"   r#   r$   r   rW  r>  r?  r   r   r   r   r/  p  s   r/  c                       s$   e Zd Z fddZdd Z  ZS )Conv1Dc                    sN   t    || _t||}tjj|dd t|| _	tt
|| _dS )z Conv1D layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2)
            Basically works like a Linear layer but the weights are transposed
        g{Gz?)ZstdN)r   r   nfr[   emptyr   initZnormal_r   r   r9  r   )r   rZ  Znxwr   r   r   r     s    
zConv1D.__init__c              	   C   sF   |  d d | jf }t| j|d| d| j}|j| }|S )Nrc   )r   rZ  r[   Zaddmmr   r  r   )r   r,   Zsize_outr   r   r   r      s     
zConv1D.forward)r"   r#   r$   r   r    r&   r   r   r   r   rY    s   rY  c                       s*   e Zd ZdZ fddZdddZ  ZS )PoolerStartLogitsz9 Compute SQuAD start_logits from sequence hidden states. c                    s   t    t|jd| _d S rN  )r   r   r   Linearhidden_sizedenser   rZ   r   r   r   r     s    
zPoolerStartLogits.__init__Nc                 C   sZ   |  |d}|dk	rVt|  jtjkrB|d|  d|  }n|d|  d|  }|S )z Args:
            **p_mask**: (`optional`) ``torch.FloatTensor`` of shape `(batch_size, seq_len)`
                invalid position mask such as query and special symbols (PAD, SEP, CLS)
                1.0 means token should be masked.
        rc   Nr
     ꌠ9Y>)F)ra  r   rM   r3   r   r[   float16)r   hidden_statesp_maskr,   r   r   r   r      s    zPoolerStartLogits.forward)Nr!   r   r   r   r   r^    s   r^  c                       s*   e Zd ZdZ fddZdddZ  ZS )PoolerEndLogitszX Compute SQuAD end_logits from sequence hidden states and start token hidden state.
    c                    sR   t    t|jd |j| _t | _tj|j|j	d| _t|jd| _
d S )NrQ   )Zepsr
   )r   r   r   r_  r`  dense_0Tanh
activation	LayerNormZlayer_norm_epsdense_1rb  r   r   r   r     s
    

zPoolerEndLogits.__init__Nc                 C   s   |dk	s|dk	st d|dk	rh|jdd \}}|ddddf dd|}|d|}|d|d}| tj||gdd}| |}| |}| 	|
d}|dk	rt|  jtjkr|d|  d|  }n|d|  d|  }|S )	a   Args:
            One of ``start_states``, ``start_positions`` should be not None.
            If both are set, ``start_positions`` overrides ``start_states``.

            **start_states**: ``torch.LongTensor`` of shape identical to hidden_states
                hidden states of the first tokens for the labeled span.
            **start_positions**: ``torch.LongTensor`` of shape ``(batch_size,)``
                position of the first token for the labeled span:
            **p_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, seq_len)``
                Mask of invalid position such as query and special symbols (PAD, SEP, CLS)
                1.0 means token should be masked.
        N7One of start_states, start_positions should be not Nonerc   r  r
   rc  rd  )rk   r`   rj   r;  ri  r[   r!  rk  rl  rm  r   rM   r3   r   re  )r   rf  start_statesstart_positionsrg  slenhszr,   r   r   r   r      s(    

zPoolerEndLogits.forward)NNNr!   r   r   r   r   rh    s   rh  c                       s*   e Zd ZdZ fddZdddZ  ZS )PoolerAnswerClasszT Compute SQuAD 2.0 answer class from classification and start tokens hidden states. c                    sB   t    t|jd |j| _t | _tj|jddd| _d S )NrQ   r
   Fr   )	r   r   r   r_  r`  ri  rj  rk  rm  rb  r   r   r   r     s    

zPoolerAnswerClass.__init__Nc                 C   s   |j d }|dk	s"|dk	s"td|dk	rX|ddddf dd|}|d|d}|dk	r|ddddf dd|}|d|d}n|dddddf }| tj||gdd}| |}| 	|d}|S )a  
        Args:
            One of ``start_states``, ``start_positions`` should be not None.
            If both are set, ``start_positions`` overrides ``start_states``.

            **start_states**: ``torch.LongTensor`` of shape identical to ``hidden_states``.
                hidden states of the first tokens for the labeled span.
            **start_positions**: ``torch.LongTensor`` of shape ``(batch_size,)``
                position of the first token for the labeled span.
            **cls_index**: torch.LongTensor of shape ``(batch_size,)``
                position of the CLS token. If None, take the last token.

            note(Original repo):
                no dependency on end_feature so that we can obtain one single `cls_logits`
                for each sample
        rc   Nrn  ro  r  )
r`   rk   rj   r;  r   ri  r[   r!  rk  rm  )r   rf  rp  rq  	cls_indexrs  Zcls_token_stater,   r   r   r   r      s$    

zPoolerAnswerClass.forward)NNNr!   r   r   r   r   rt    s   rt  c                       s*   e Zd ZdZ fddZdddZ  ZS )	SQuADHeada   A SQuAD head inspired by XLNet.

    Parameters:
        config (:class:`~transformers.XLNetConfig`): Model configuration class with all the parameters of the model.

    Inputs:
        **hidden_states**: ``torch.FloatTensor`` of shape ``(batch_size, seq_len, hidden_size)``
            hidden states of sequence tokens
        **start_positions**: ``torch.LongTensor`` of shape ``(batch_size,)``
            position of the first token for the labeled span.
        **end_positions**: ``torch.LongTensor`` of shape ``(batch_size,)``
            position of the last token for the labeled span.
        **cls_index**: torch.LongTensor of shape ``(batch_size,)``
            position of the CLS token. If None, take the last token.
        **is_impossible**: ``torch.LongTensor`` of shape ``(batch_size,)``
            Whether the question has a possible answer in the paragraph or not.
        **p_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, seq_len)``
            Mask of invalid position such as query and special symbols (PAD, SEP, CLS)
            1.0 means token should be masked.

    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **loss**: (`optional`, returned if both ``start_positions`` and ``end_positions`` are provided) ``torch.FloatTensor`` of shape ``(1,)``:
            Classification loss as the sum of start token, end token (and is_impossible if provided) classification losses.
        **start_top_log_probs**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
            ``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top)``
            Log probabilities for the top config.start_n_top start token possibilities (beam-search).
        **start_top_index**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
            ``torch.LongTensor`` of shape ``(batch_size, config.start_n_top)``
            Indices for the top config.start_n_top start token possibilities (beam-search).
        **end_top_log_probs**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
            ``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``
            Log probabilities for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search).
        **end_top_index**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
            ``torch.LongTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``
            Indices for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search).
        **cls_logits**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
            ``torch.FloatTensor`` of shape ``(batch_size,)``
            Log probabilities for the ``is_impossible`` label of the answers.
    c                    s<   t    |j| _|j| _t|| _t|| _t|| _	d S r   )
r   r   start_n_top	end_n_topr^  start_logitsrh  
end_logitsrt  answer_classrb  r   r   r   r   T  s    


zSQuADHead.__init__Nc                 C   s  d}| j ||d}|d k	r|d k	r||||fD ]"}	|	d k	r.|	 dkr.|	d q.| j|||d}
t }|||}||
|}|| d }|d k	r|d k	r| j|||d}t }|||}||d 7 }|f| }n| \}}}t	j
|dd	}tj|| jdd	\}}|ddd|}t|d
|}|dd|dd}|d|}|d k	rb|dnd }| j|||d}
t	j
|
dd	}tj|| jdd	\}}|d| j| j }|d| j| j }td||}| j|||d}|||||f| }|S )Nr   )rg  r
   rc   )rq  rg  rQ   )rq  rv  g      ?r  ro  )rp  rg  z
blh,bl->bh)rp  rv  )rz  rU   Zsqueeze_r{  r   r|  r   ZBCEWithLogitsLossr   r  r  r[   r=  rx  re   rj   r;  r:  ry  r  Zeinsum)r   rf  rq  Zend_positionsrv  Zis_impossiblerg  r   rz  r,   r{  Zloss_fctZ
start_lossZend_lossZ
total_lossZ
cls_logitsZloss_fct_clsZcls_lossZbszrr  rs  Zstart_log_probsZstart_top_log_probsZstart_top_indexZstart_top_index_exprp  Zhidden_states_expandedZend_log_probsZend_top_log_probsZend_top_indexr   r   r   r    ]  sX    


  

  
zSQuADHead.forward)NNNNNr!   r   r   r   r   rw  +  s   (
class SequenceSummary(nn.Module):
    r""" Compute a single vector summary of a sequence hidden states according to various possibilities:
        Args of the config class:
            summary_type:
                - 'last' => [default] take the last token hidden state (like XLNet)
                - 'first' => take the first token hidden state (like Bert)
                - 'mean' => take the mean of all tokens hidden states
                - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
                - 'attn' => Not implemented now, use multi-head attention
            summary_use_proj: Add a projection after the vector extraction
            summary_proj_to_labels: If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
            summary_activation: 'tanh' or another string => add an activation to the output, Other => no activation. Default: no activation.
            summary_first_dropout: Add a dropout before the projection and activation
            summary_last_dropout: Add a dropout after the projection and activation
    """

    def __init__(self, config: PretrainedConfig):
        super().__init__()

        self.summary_type = getattr(config, "summary_type", "last")
        if self.summary_type == "attn":
            # 'attn' would require a standard multi-head attention module with
            # absolute positional embeddings; not implemented here.
            raise NotImplementedError

        self.summary = Identity()
        if hasattr(config, "summary_use_proj") and config.summary_use_proj:
            if hasattr(config, "summary_proj_to_labels") and config.summary_proj_to_labels and config.num_labels > 0:
                num_classes = config.num_labels
            else:
                num_classes = config.hidden_size
            self.summary = nn.Linear(config.hidden_size, num_classes)

        activation_string = getattr(config, "summary_activation", None)
        self.activation = get_activation(activation_string) if activation_string else Identity()

        self.first_dropout = Identity()
        if hasattr(config, "summary_first_dropout") and config.summary_first_dropout > 0:
            self.first_dropout = nn.Dropout(config.summary_first_dropout)

        self.last_dropout = Identity()
        if hasattr(config, "summary_last_dropout") and config.summary_last_dropout > 0:
            self.last_dropout = nn.Dropout(config.summary_last_dropout)

    def forward(self, hidden_states, cls_index=None):
        """ hidden_states: float Tensor in shape [bsz, ..., seq_len, hidden_size], the hidden-states of the last layer.
            cls_index: [optional] position of the classification token if summary_type == 'cls_index',
                shape (bsz,) or more generally (bsz, ...) where ... are optional leading dimensions of hidden_states.
                if summary_type == 'cls_index' and cls_index is None:
                    we take the last token of the sequence as classification token
        """
        if self.summary_type == "last":
            output = hidden_states[:, -1]
        elif self.summary_type == "first":
            output = hidden_states[:, 0]
        elif self.summary_type == "mean":
            output = hidden_states.mean(dim=1)
        elif self.summary_type == "cls_index":
            if cls_index is None:
                cls_index = torch.full_like(hidden_states[..., :1, :], hidden_states.shape[-2] - 1, dtype=torch.long)
            else:
                cls_index = cls_index.unsqueeze(-1).unsqueeze(-1)
                cls_index = cls_index.expand((-1,) * (cls_index.dim() - 1) + (hidden_states.size(-1),))
            # shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dims of hidden_states
            output = hidden_states.gather(-2, cls_index).squeeze(-2)  # shape (bsz, XX, hidden_size)
        elif self.summary_type == "attn":
            raise NotImplementedError

        output = self.first_dropout(output)
        output = self.summary(output)
        output = self.activation(output)
        output = self.last_dropout(output)

        return output




zSequenceSummary.forward)N)r"   r#   r$   r%   r   r   r    r&   r   r   r   r   r}    s   r}  c                 C   s2   |  | }tj|dd|| }| | S )z Replace non-padding symbols with their position numbers. Position numbers begin at
    padding_idx+1. Padding symbols are ignored. This is modified from fairseq's
    `utils.make_positions`.

    :param torch.Tensor x:
    :return torch.Tensor:
    r
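
# Worked example (illustrative): with padding_idx = 1,
#     input_ids     = [[0, 5, 8, 1, 1]]
#     mask          = [[1, 1, 1, 0, 0]]
#     cumsum * mask = [[1, 2, 3, 0, 0]]
#     + padding_idx = [[2, 3, 4, 1, 1]]
# i.e. real tokens are numbered from padding_idx + 1 onwards and every padding
# position collapses back to padding_idx itself.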
def prune_linear_layer(layer, index, dim=0):
    """ Prune a linear layer (a model parameters) to keep only entries in index.
        Return the pruned layer as a new layer with requires_grad=True.
        Used to remove heads.
    """
    index = index.to(layer.weight.device)
    W = layer.weight.index_select(dim, index).clone().detach()
    if layer.bias is not None:
        b = layer.bias.clone().detach() if dim == 1 else layer.bias[index].clone().detach()
    new_size = list(layer.weight.size())
    new_size[dim] = len(index)
    new_layer = nn.Linear(new_size[1], new_size[0], bias=layer.bias is not None).to(layer.weight.device)
    new_layer.weight.requires_grad = False
    new_layer.weight.copy_(W.contiguous())
    new_layer.weight.requires_grad = True
    if layer.bias is not None:
        new_layer.bias.requires_grad = False
        new_layer.bias.copy_(b.contiguous())
        new_layer.bias.requires_grad = True
    return new_layer
def prune_conv1d_layer(layer, index, dim=1):
    """ Prune a Conv1D layer (a model parameters) to keep only entries in index.
        A Conv1D works as a Linear layer (see e.g. BERT) but the weights are transposed.
        Return the pruned layer as a new layer with requires_grad=True.
        Used to remove heads.
    """
    index = index.to(layer.weight.device)
    W = layer.weight.index_select(dim, index).clone().detach()
    b = layer.bias.clone().detach() if dim == 0 else layer.bias[index].clone().detach()
    new_size = list(layer.weight.size())
    new_size[dim] = len(index)
    new_layer = Conv1D(new_size[1], new_size[0]).to(layer.weight.device)
    new_layer.weight.requires_grad = False
    new_layer.weight.copy_(W.contiguous())
    new_layer.weight.requires_grad = True
    new_layer.bias.requires_grad = False
    new_layer.bias.copy_(b.contiguous())
    new_layer.bias.requires_grad = True
    return new_layer
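
# Note the transposed convention (illustrative): a Conv1D weight is stored as
# (in_features, out_features), so head pruning selects along dim=1 by default.
#
#     conv = Conv1D(12, 4)                              # weight shape (4, 12)
#     pruned = prune_conv1d_layer(conv, torch.tensor([0, 1, 2]))
#     assert pruned.weight.shape == (4, 3)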
def prune_layer(layer, index, dim=None):
    """ Prune a Conv1D or nn.Linear layer (a model parameters) to keep only entries in index.
        Return the pruned layer as a new layer with requires_grad=True.
        Used to remove heads.
    """
    if isinstance(layer, nn.Linear):
        return prune_linear_layer(layer, index, dim=0 if dim is None else dim)
    elif isinstance(layer, Conv1D):
        return prune_conv1d_layer(layer, index, dim=1 if dim is None else dim)
    else:
        raise ValueError("Can't prune layer of class {}".format(layer.__class__))
def apply_chunking_to_forward(
    chunk_size: int, chunk_dim: int, forward_fn: Callable[..., torch.Tensor], *input_tensors
) -> torch.Tensor:
    """
    This function chunks the `input_tensors` into smaller input tensor parts of size `chunk_size` over the dimension `chunk_dim`.
    It then applies a layer `forward_fn` to each chunk independently to save memory.
    If the `forward_fn` is independent across the `chunk_dim` this function will yield the
    same result as not applying it.

    Args:
        chunk_size: int - the chunk size of a chunked tensor. `num_chunks` = `len(input_tensors[0]) / chunk_size`
        chunk_dim: int - the dimension over which the input_tensors should be chunked
        forward_fn: fn - the forward fn of the model
        input_tensors: tuple(torch.Tensor) - the input tensors of `forward_fn` which are chunked
    Returns:
        a Tensor with the same shape the forward_fn would have given if applied directly

    Examples::

        # rename the usual forward() fn to forward_chunk()
        def forward_chunk(self, hidden_states):
            hidden_states = self.decoder(hidden_states)
            return hidden_states

        # implement a chunked forward function
        def forward(self, hidden_states):
            return apply_chunking_to_forward(self.chunk_size_lm_head, self.seq_len_dim, self.forward_chunk, hidden_states)
    """
    assert len(input_tensors) > 0, "{} has to be a tuple/list of tensors".format(input_tensors)
    tensor_shape = input_tensors[0].shape
    assert all(
        input_tensor.shape == tensor_shape for input_tensor in input_tensors
    ), "All input tensors have to be of the same shape"

    # inspect.signature exists since python 3.5 and is a python method -> no problem with backward compatibility
    num_args_in_forward_chunk_fn = len(inspect.signature(forward_fn).parameters)
    assert num_args_in_forward_chunk_fn == len(
        input_tensors
    ), "forward_chunk_fn expects {} arguments, but only {} input tensors are given".format(
        num_args_in_forward_chunk_fn, len(input_tensors)
    )

    if chunk_size > 0:
        assert (
            input_tensors[0].shape[chunk_dim] % chunk_size == 0
        ), "The dimension to be chunked {} has to be a multiple of the chunk size {}".format(
            input_tensors[0].shape[chunk_dim], chunk_size
        )

        num_chunks = input_tensors[0].shape[chunk_dim] // chunk_size

        # chunk every input tensor into `num_chunks` pieces along `chunk_dim`
        input_tensors_chunks = tuple(input_tensor.chunk(num_chunks, dim=chunk_dim) for input_tensor in input_tensors)
        # apply the forward fn to every tuple of aligned chunks
        output_chunks = tuple(forward_fn(*input_tensors_chunk) for input_tensors_chunk in zip(*input_tensors_chunks))
        # concatenate the per-chunk outputs back along the chunked dimension
        return torch.cat(output_chunks, dim=chunk_dim)

    return forward_fn(*input_tensors)
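
# Equivalence check (illustrative): chunking a position-wise function changes
# peak memory, not the result.
#
#     dense = nn.Linear(16, 16)
#     x = torch.rand(2, 8, 16)
#     chunked = apply_chunking_to_forward(2, 1, dense.forward, x)  # 4 chunks
#     assert torch.allclose(dense(x), chunked)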