"""TF general model utils."""

import functools
import logging
import os

import h5py
import numpy as np
import tensorflow as tf
from tensorflow.python.keras.saving import hdf5_format

from .configuration_utils import PretrainedConfig
from .file_utils import (
    DUMMY_INPUTS,
    TF2_WEIGHTS_NAME,
    WEIGHTS_NAME,
    cached_path,
    hf_bucket_url,
    is_remote_url,
)
from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model


logger = logging.getLogger(__name__)


class TFModelUtilsMixin:
    """
    A few utilities for `tf.keras.Model`s, to be used as a mixin.
    """

    def num_parameters(self, only_trainable: bool = False) -> int:
        """
        Get number of (optionally, trainable) parameters in the model.
        """
        if only_trainable:
            return int(sum(np.prod(w.shape.as_list()) for w in self.trainable_variables))
        else:
            return self.count_params()
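# Usage sketch (illustrative only; `model` stands in for any concrete subclass of
# TFPreTrainedModel defined below):
#
#     >>> model.num_parameters()                       # total parameter count
#     >>> model.num_parameters(only_trainable=True)    # trainable parameters only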
__module____qualname____doc__boolr   r   r   r   r   r   r   "   s   r   c                    s    j t dddkr"tdtfdd}| _ t dsRtdt jdrp fd	d
}| _d _tt	j
jdrt	j
j    S )al  
    Decorate a Keras Layer class to support Keras serialization.

    This is done by:
    1. adding a `transformers_config` dict to the Keras config dictionary in `get_config` (called by Keras at
       serialization time
    2. wrapping `__init__` to accept that `transformers_config` dict (passed by Keras at deserialization time) and
       convert it to a config object for the actual layer initializer
    3. registering the class as a custom object in Keras (if the Tensorflow version supports this), so that it does
       not need to be supplied in `custom_objects` in the call to `tf.keras.models.load_model`

    :param cls: a tf.keras.layers.Layers subclass that accepts a `config` argument to its initializer (typically a
                `TF*MainLayer` class in this project)
    :return: the same class object, with modifications for Keras deserialization.
    config_classNz2Must set `config_class` to use @keras_serializablec                    s   | dd }|r&t|d tr&|d n
|dd }|d k	rL|d k	rLtdnH|d k	rf| f|| n.|d k	r |}| |f|| ntd|| _d S )Ntransformers_configr   configz<Must pass either `config` or `transformers_config`, not bothzLMust pass either `config` (PretrainedConfig) or `transformers_config` (dict))pop
isinstancer   get
ValueError	from_dict_transformers_config)r   argskwargsr#   r$   )r"   initializerr   r   wrapped_initG   s    &

z(keras_serializable.<locals>.wrapped_init
get_configz@Only use @keras_serializable on tf.keras.layers.Layer subclassesZ_is_defaultc                    s    t  |  }| j |d< |S )Nr#   )superr/   r*   Zto_dict)r   cfg)clsr   r   r/   ^   s    z&keras_serializable.<locals>.get_configTregister_keras_serializable)__init__getattrAttributeError	functoolswrapshasattr	TypeErrorr/   Z_keras_serializabletfkerasutilsr3   )r2   r.   r/   r   )r2   r"   r-   r   keras_serializable1   s     
r>   c                       s   e Zd ZdZdZi ZdZedd Z fddZ	dd	 Z
d
d Zd"ddZd#ddZdd Zdd Zedd Zdd Zdd Zd$ddZdd Zdd Zed d! Z  ZS )%TFPreTrainedModela4   Base class for all TF models.

        :class:`~transformers.TFPreTrainedModel` takes care of storing the configuration of the models and handles methods for loading/downloading/saving models
        as well as a few methods common to all models to (i) resize the input embeddings and (ii) prune heads in the self-attention heads.

        Class attributes (overridden by derived classes):
            - ``config_class``: a class derived from :class:`~transformers.PretrainedConfig` to use as configuration class for this model architecture.
            - ``pretrained_model_archive_map``: a python ``dict`` of with `short-cut-names` (string) as keys and `url` (string) of associated pretrained weights as values.
            - ``load_tf_weights``: a python ``method`` for loading a TensorFlow checkpoint in a PyTorch model, taking as arguments:

                - ``model``: an instance of the relevant subclass of :class:`~transformers.PreTrainedModel`,
                - ``config``: an instance of the relevant subclass of :class:`~transformers.PretrainedConfig`,
                - ``path``: a path (string) to the TensorFlow checkpoint.

            - ``base_model_prefix``: a string indicating the attribute associated to the base model in derived classes of the same architecture adding modules on top of the base model.
    N c                 C   s   dt tiS )zf Dummy inputs to build the network.

        Returns:
            tf.Tensor with dummy inputs
        	input_ids)r;   Zconstantr   r   r   r   r   dummy_inputs   s    zTFPreTrainedModel.dummy_inputsc                    s:   t  j|| t|ts0td| jj| jj|| _d S )NzParameter config in `{}(config)` should be an instance of class `PretrainedConfig`. To create a model from a pretrained model use `model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`)	r0   r4   r&   r   r(   format	__class__r   r$   )r   r$   inputsr,   rE   r   r   r4      s    
 zTFPreTrainedModel.__init__c                 C   s&   t | | j| }|| k	r| S tdS )z
    def get_input_embeddings(self):
        """
        Returns the model's input embeddings.

        Returns:
            :obj:`tf.keras.layers.Layer`:
                A Keras layer mapping vocabulary to hidden states.
        """
        base_model = getattr(self, self.base_model_prefix, self)
        if base_model is not self:
            return base_model.get_input_embeddings()
        else:
            raise NotImplementedError
    def get_output_embeddings(self):
        """
        Returns the model's output embeddings.

        Returns:
            :obj:`tf.keras.layers.Layer`:
                A Keras layer mapping hidden states to vocabulary.
        """
        return None  # Overwrite for models with output embeddings
    def _get_resized_embeddings(self, old_embeddings, new_num_tokens=None):
        """ Build a resized Embedding Variable from a provided token Embedding Module.
            Increasing the size will add newly initialized vectors at the end
            Reducing the size will remove vectors from the end

        Args:
            new_num_tokens: (`optional`) int
                New number of tokens in the embedding matrix.
                Increasing the size will add newly initialized vectors at the end
                Reducing the size will remove vectors from the end
                If not provided or None: return the provided token Embedding Module.
        Return: ``tf.Variable``
            Pointer to the resized Embedding Module or the old Embedding Module if new_num_tokens is None
        """
        # Resizing is not implemented for TF models in this version; the method returns None implicitly.
    def resize_token_embeddings(self, new_num_tokens=None):
        """ Resize input token embeddings matrix of the model if new_num_tokens != config.vocab_size.
        Take care of tying weights embeddings afterwards if the model class has a `tie_weights()` method.

        Arguments:

            new_num_tokens: (`optional`) int:
                New number of tokens in the embedding matrix. Increasing the size will add newly initialized vectors at the end. Reducing the size will remove vectors from the end.
                If not provided or None: does nothing and just returns a pointer to the input tokens ``tf.Variable`` Module of the model.

        Return: ``tf.Variable``
            Pointer to the input tokens Embeddings Module of the model
        """
        raise NotImplementedError

    def prune_heads(self, heads_to_prune):
        """ Prunes heads of the base model.

            Arguments:

                heads_to_prune: dict with keys being selected layer indices (`int`) and associated values being the list of heads to prune in said layer (list of `int`).
        """
        raise NotImplementedError

    def save_pretrained(self, save_directory):
        """ Save a model and its configuration file to a directory, so that it
            can be re-loaded using the :func:`~transformers.PreTrainedModel.from_pretrained` class method.
        """
        assert os.path.isdir(
            save_directory
        ), "Saving path should be a directory where the model and configuration can be saved"

        # Save configuration file
        self.config.save_pretrained(save_directory)

        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(save_directory, TF2_WEIGHTS_NAME)
        self.save_weights(output_model_file)
        logger.info("Model weights saved in {}".format(output_model_file))
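    # Round-trip sketch (illustrative only; `TFBertModel` stands in for any concrete
    # subclass of this class):
    #
    #     >>> model = TFBertModel.from_pretrained("bert-base-uncased")
    #     >>> model.save_pretrained("./my_model_directory/")  # writes config.json + tf_model.h5
    #     >>> reloaded = TFBertModel.from_pretrained("./my_model_directory/")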
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        r"""Instantiate a pretrained TF 2.0 model from a pre-trained model configuration.

        The warning ``Weights from XXX not initialized from pretrained model`` means that the weights of XXX do not come pre-trained with the rest of the model.
        It is up to you to train those weights with a downstream fine-tuning task.

        The warning ``Weights from XXX not used in YYY`` means that the layer XXX is not used by YYY, therefore those weights are discarded.

        Parameters:
            pretrained_model_name_or_path: either:

                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
                - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                - a path or url to a `PyTorch state_dict save file` (e.g. `./pt_model/pytorch_model.bin`). In this case, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the PyTorch checkpoint in a TensorFlow model using the provided conversion scripts and loading the TensorFlow model afterwards.

            model_args: (`optional`) Sequence of positional arguments:
                All remaining positional arguments will be passed to the underlying model's ``__init__`` method

            config: (`optional`) one of:
                    - an instance of a class derived from :class:`~transformers.PretrainedConfig`, or
                    - a string valid as input to :func:`~transformers.PretrainedConfig.from_pretrained()`
                Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:

                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by supplying the save directory.
                - the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.

            from_pt: (`optional`) boolean, default False:
                Load the model weights from a PyTorch state_dict save file (see docstring of pretrained_model_name_or_path argument).

            cache_dir: (`optional`) string:
                Path to a directory in which a downloaded pre-trained model
                configuration should be cached if the standard cache should not be used.

            force_download: (`optional`) boolean, default False:
                Force to (re-)download the model weights and configuration files and override the cached versions if they exist.

            resume_download: (`optional`) boolean, default False:
                Do not delete incompletely received file. Attempt to resume the download if such a file exists.

            proxies: (`optional`) dict, default None:
                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
                The proxies are used on each request.

            output_loading_info: (`optional`) boolean:
                Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages.

            kwargs: (`optional`) Remaining dictionary of keyword arguments:
                Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:

                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.

        Examples::

            # For example purposes. Not runnable.
            model = BertModel.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
            model = BertModel.from_pretrained('./test/saved_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
            model = BertModel.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
            assert model.config.output_attention == True
            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
            config = BertConfig.from_json_file('./tf_model/my_tf_model_config.json')
            model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_pt=True, config=config)

        """
        config = kwargs.pop("config", None)
        cache_dir = kwargs.pop("cache_dir", None)
        from_pt = kwargs.pop("from_pt", False)
        force_download = kwargs.pop("force_download", False)
        resume_download = kwargs.pop("resume_download", False)
        proxies = kwargs.pop("proxies", None)
        output_loading_info = kwargs.pop("output_loading_info", False)
        use_cdn = kwargs.pop("use_cdn", True)

        # Load config if we don't provide a configuration
        if not isinstance(config, PretrainedConfig):
            config_path = config if config is not None else pretrained_model_name_or_path
            config, model_kwargs = cls.config_class.from_pretrained(
                config_path,
                *model_args,
                cache_dir=cache_dir,
                return_unused_kwargs=True,
                force_download=force_download,
                resume_download=resume_download,
                proxies=proxies,
                **kwargs,
            )
        else:
            model_kwargs = kwargs

        # Load model
        if pretrained_model_name_or_path is not None:
            if pretrained_model_name_or_path in cls.pretrained_model_archive_map:
                archive_file = cls.pretrained_model_archive_map[pretrained_model_name_or_path]
            elif os.path.isdir(pretrained_model_name_or_path):
                if os.path.isfile(os.path.join(pretrained_model_name_or_path, TF2_WEIGHTS_NAME)):
                    # Load from a TF 2.0 checkpoint
                    archive_file = os.path.join(pretrained_model_name_or_path, TF2_WEIGHTS_NAME)
                elif from_pt and os.path.isfile(os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)):
                    # Load from a PyTorch checkpoint
                    archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)
                else:
                    raise EnvironmentError(
                        "Error no file named {} found in directory {} or `from_pt` set to False".format(
                            [WEIGHTS_NAME, TF2_WEIGHTS_NAME], pretrained_model_name_or_path
                        )
                    )
            elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
                archive_file = pretrained_model_name_or_path
            elif os.path.isfile(pretrained_model_name_or_path + ".index"):
                archive_file = pretrained_model_name_or_path + ".index"
            else:
                archive_file = hf_bucket_url(
                    pretrained_model_name_or_path,
                    filename=(WEIGHTS_NAME if from_pt else TF2_WEIGHTS_NAME),
                    use_cdn=use_cdn,
                )

            # redirect to the cache, if necessary
            try:
                resolved_archive_file = cached_path(
                    archive_file,
                    cache_dir=cache_dir,
                    force_download=force_download,
                    resume_download=resume_download,
                    proxies=proxies,
                )
            except EnvironmentError as e:
                if pretrained_model_name_or_path in cls.pretrained_model_archive_map:
                    logger.error(
                        "Couldn't reach server at '{}' to download pretrained weights.".format(archive_file)
                    )
                else:
                    logger.error(
                        "Model name '{}' was not found in model name list ({}). "
                        "We assumed '{}' was a path or url but couldn't find any file "
                        "associated to this path or url.".format(
                            pretrained_model_name_or_path,
                            ", ".join(cls.pretrained_model_archive_map.keys()),
                            archive_file,
                        )
                    )
                raise e

            if resolved_archive_file == archive_file:
                logger.info("loading weights file {}".format(archive_file))
            else:
                logger.info("loading weights file {} from cache at {}".format(archive_file, resolved_archive_file))
        else:
            resolved_archive_file = None

        # Instantiate model.
        model = cls(config, *model_args, **model_kwargs)

        if from_pt:
            # Load from a PyTorch checkpoint
            return load_pytorch_checkpoint_in_tf2_model(model, resolved_archive_file, allow_missing_keys=True)

        model(model.dummy_inputs, training=False)  # build the network with dummy inputs

        assert os.path.isfile(resolved_archive_file), "Error retrieving file {}".format(resolved_archive_file)
        # 'by_name' allows us to do transfer learning by skipping/adding layers
        try:
            model.load_weights(resolved_archive_file, by_name=True)
        except OSError:
            raise OSError(
                "Unable to load weights from h5 file. "
                "If you tried to load a TF 2.0 model from a PyTorch checkpoint, please set from_pt=True. "
            )

        model(model.dummy_inputs, training=False)  # Make sure restore ops are run

        # Check whether the loaded and instantiated layer sets match, for loading info
        with h5py.File(resolved_archive_file, "r") as f:
            if "layer_names" not in f.attrs and "model_weights" in f:
                f = f["model_weights"]
            hdf5_layer_names = set(hdf5_format.load_attributes_from_hdf5_group(f, "layer_names"))
        model_layer_names = set(layer.name for layer in model.layers)
        missing_keys = list(model_layer_names - hdf5_layer_names)
        unexpected_keys = list(hdf5_layer_names - model_layer_names)
        error_msgs = []

        if len(missing_keys) > 0:
            logger.warning(
                "Layers of {} not initialized from pretrained model: {}".format(
                    model.__class__.__name__, missing_keys
                )
            )
        if len(unexpected_keys) > 0:
            logger.warning(
                "Layers from pretrained model not used in {}: {}".format(model.__class__.__name__, unexpected_keys)
            )
        if len(error_msgs) > 0:
            raise RuntimeError(
                "Error(s) in loading weights for {}:\n\t{}".format(model.__class__.__name__, "\n\t".join(error_msgs))
            )
        if output_loading_info:
            loading_info = {
                "missing_keys": missing_keys,
                "unexpected_keys": unexpected_keys,
                "error_msgs": error_msgs,
            }
            return model, loading_info

        return model

    def prepare_inputs_for_generation(self, inputs, **kwargs):
        return {"inputs": inputs}
    def _use_cache(self, outputs, use_cache):
        """During generation, decide whether to pass the `past` variable to the next forward pass."""
        if len(outputs) <= 1 or use_cache is False:
            return False
        if hasattr(self.config, "mem_len") and self.config.mem_len == 0:
            return False
        return True
    def generate(
        self,
        input_ids=None,
        max_length=None,
        min_length=None,
        do_sample=None,
        early_stopping=None,
        num_beams=None,
        temperature=None,
        top_k=None,
        top_p=None,
        repetition_penalty=None,
        bad_words_ids=None,
        bos_token_id=None,
        pad_token_id=None,
        eos_token_id=None,
        length_penalty=None,
        no_repeat_ngram_size=None,
        num_return_sequences=None,
        attention_mask=None,
        decoder_start_token_id=None,
        use_cache=None,
    ):
        r""" Generates sequences for models with a LM head. The method currently supports greedy or penalized greedy decoding, sampling with top-k or nucleus sampling
        and beam-search.

        Adapted in part from `Facebook's XLM beam search code`_.

        .. _`Facebook's XLM beam search code`:
           https://github.com/facebookresearch/XLM/blob/9e6f6814d17be4fe5b15f2e6c43eb2b2d76daeb4/src/model/transformer.py#L529


        Parameters:

            input_ids: (`optional`) `tf.Tensor` of `dtype=tf.int32` of shape `(batch_size, sequence_length)`
                The sequence used as a prompt for the generation. If `None` the method initializes
                it as an empty `tf.Tensor` of shape `(1,)`.

            max_length: (`optional`) int
                The max length of the sequence to be generated. Between 1 and infinity. Default to 20.

            min_length: (`optional`) int
                The min length of the sequence to be generated. Between 0 and infinity. Default to 0.

            do_sample: (`optional`) bool
                If set to `False` greedy decoding is used. Otherwise sampling is used. Defaults to `False` as defined in `configuration_utils.PretrainedConfig`.

            early_stopping: (`optional`) bool
                if set to `True` beam search is stopped when at least `num_beams` sentences finished per batch. Defaults to `False` as defined in `configuration_utils.PretrainedConfig`.

            num_beams: (`optional`) int
                Number of beams for beam search. Must be between 1 and infinity. 1 means no beam search. Default to 1.

            temperature: (`optional`) float
                The value used to modulate the next token probabilities. Must be strictly positive. Default to 1.0.

            top_k: (`optional`) int
                The number of highest probability vocabulary tokens to keep for top-k-filtering. Between 1 and infinity. Default to 50.

            top_p: (`optional`) float
                The cumulative probability of parameter highest probability vocabulary tokens to keep for nucleus sampling. Must be between 0 and 1. Default to 1.

            repetition_penalty: (`optional`) float
                The parameter for repetition penalty. Between 1.0 and infinity. 1.0 means no penalty. Default to 1.0.

            bos_token_id: (`optional`) int
                Beginning of sentence token if no prompt is provided. Default to the model-specific bos_token_id or None if it does not exist.

            pad_token_id: (`optional`) int
                Pad token. Defaults to pad_token_id as defined in the models config.

            eos_token_id: (`optional`) int
                EOS token. Defaults to eos_token_id as defined in the models config.

            length_penalty: (`optional`) float
                Exponential penalty to the length. Default to 1.

            no_repeat_ngram_size: (`optional`) int
                If set to int > 0, all ngrams of size `no_repeat_ngram_size` can only occur once.

            bad_words_ids: (`optional`) list of lists of int
                `bad_words_ids` contains tokens that are not allowed to be generated. In order to get the tokens of the words that should not appear in the generated text, use `tokenizer.encode(bad_word, add_prefix_space=True)`.

            num_return_sequences: (`optional`) int
                The number of independently computed returned sequences for each element in the batch. Default to 1.

            attention_mask (`optional`) obj: `tf.Tensor` with `dtype=tf.int32` of same shape as `input_ids`
                Mask to avoid performing attention on padding token indices.
                Mask values selected in ``[0, 1]``:
                ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
                Defaults to `None`.

                `What are attention masks? <../glossary.html#attention-mask>`__

            decoder_start_token_id=None: (`optional`) int
                If an encoder-decoder model starts decoding with a different token than BOS.
                Defaults to `None` and is changed to `BOS` later.

            use_cache: (`optional`) bool
                If `use_cache` is True, past key values are used to speed up decoding if applicable to model. Defaults to `True`.

        Return:

            output: `tf.Tensor` of `dtype=tf.int32` shape `(batch_size * num_return_sequences, sequence_length)`
                sequence_length is either equal to max_length or shorter if all batches finished early due to the `eos_token_id`

        Examples::

            tokenizer = AutoTokenizer.from_pretrained('distilgpt2')   # Initialize tokenizer
            model = TFAutoModelWithLMHead.from_pretrained('distilgpt2')    # Download model and configuration from S3 and cache.
            outputs = model.generate(max_length=40)  # do greedy decoding
            print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True)))

            tokenizer = AutoTokenizer.from_pretrained('openai-gpt')   # Initialize tokenizer
            model = TFAutoModelWithLMHead.from_pretrained('openai-gpt')    # Download model and configuration from S3 and cache.
            input_context = 'The dog'
            input_ids = tokenizer.encode(input_context, return_tensors='tf')  # encode input context
            outputs = model.generate(input_ids=input_ids, num_beams=5, num_return_sequences=3, temperature=1.5)  # generate 3 independent sequences using beam search decoding (5 beams) with sampling from initial context 'The dog'
            for i in range(3): #  3 output sequences were generated
                print('Generated {}: {}'.format(i, tokenizer.decode(outputs[i], skip_special_tokens=True)))

            tokenizer = AutoTokenizer.from_pretrained('distilgpt2')   # Initialize tokenizer
            model = TFAutoModelWithLMHead.from_pretrained('distilgpt2')    # Download model and configuration from S3 and cache.
            input_context = 'The dog'
            input_ids = tokenizer.encode(input_context, return_tensors='tf')  # encode input context
            outputs = model.generate(input_ids=input_ids, max_length=40, temperature=0.7, num_return_sequences=3)  # generate 3 sequences using sampling
            for i in range(3): #  3 output sequences were generated
                print('Generated {}: {}'.format(i, tokenizer.decode(outputs[i], skip_special_tokens=True)))

            tokenizer = AutoTokenizer.from_pretrained('ctrl')   # Initialize tokenizer
            model = TFAutoModelWithLMHead.from_pretrained('ctrl')    # Download model and configuration from S3 and cache.
            input_context = 'Legal My neighbor is'  # "Legal" is one of the control codes for ctrl
            input_ids = tokenizer.encode(input_context, return_tensors='tf')  # encode input context
            outputs = model.generate(input_ids=input_ids, max_length=50, temperature=0.7, repetition_penalty=1.2)  # generate sequences
            print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True)))

            tokenizer = AutoTokenizer.from_pretrained('gpt2')   # Initialize tokenizer
            model = TFAutoModelWithLMHead.from_pretrained('gpt2')    # Download model and configuration from S3 and cache.
            input_context = 'My cute dog'
            bad_words_ids = [tokenizer.encode(bad_word, add_prefix_space=True) for bad_word in ['idiot', 'stupid', 'shut up']]
            input_ids = tokenizer.encode(input_context, return_tensors='tf')  # encode input context
            outputs = model.generate(input_ids=input_ids, max_length=100, do_sample=True, bad_words_ids=bad_words_ids)  # generate sequences without allowing bad_words to be generated
        """

        # We cannot generate if the model does not have a LM head
        if self.get_output_embeddings() is None:
            raise AttributeError(
                "You tried to generate sequences with a model that does not have a LM Head."
                "Please use another model class (e.g. `TFOpenAIGPTLMHeadModel`, `TFXLNetLMHeadModel`, "
                "`TFGPT2LMHeadModel`, `TFCTRLLMHeadModel`, `TFT5ForConditionalGeneration`, "
                "`TFTransfoXLLMHeadModel`)"
            )

        max_length = max_length if max_length is not None else self.config.max_length
        min_length = min_length if min_length is not None else self.config.min_length
        do_sample = do_sample if do_sample is not None else self.config.do_sample
        early_stopping = early_stopping if early_stopping is not None else self.config.early_stopping
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        num_beams = num_beams if num_beams is not None else self.config.num_beams
        temperature = temperature if temperature is not None else self.config.temperature
        top_k = top_k if top_k is not None else self.config.top_k
        top_p = top_p if top_p is not None else self.config.top_p
        repetition_penalty = repetition_penalty if repetition_penalty is not None else self.config.repetition_penalty
        bos_token_id = bos_token_id if bos_token_id is not None else self.config.bos_token_id
        pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id
        eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id
        length_penalty = length_penalty if length_penalty is not None else self.config.length_penalty
        no_repeat_ngram_size = (
            no_repeat_ngram_size if no_repeat_ngram_size is not None else self.config.no_repeat_ngram_size
        )
        bad_words_ids = bad_words_ids if bad_words_ids is not None else self.config.bad_words_ids
        num_return_sequences = (
            num_return_sequences if num_return_sequences is not None else self.config.num_return_sequences
        )
        decoder_start_token_id = (
            decoder_start_token_id if decoder_start_token_id is not None else self.config.decoder_start_token_id
        )

        if input_ids is not None:
            batch_size = shape_list(input_ids)[0]  # overridden by the input batch_size
        else:
            batch_size = 1

        assert isinstance(max_length, int) and max_length > 0, "`max_length` should be a strictly positive integer."
        assert isinstance(min_length, int) and min_length >= 0, "`min_length` should be a positive integer."
        assert isinstance(do_sample, bool), "`do_sample` should be a boolean."
        assert isinstance(early_stopping, bool), "`early_stopping` should be a boolean."
        assert isinstance(use_cache, bool), "`use_cache` should be a boolean."
        assert isinstance(num_beams, int) and num_beams > 0, "`num_beams` should be a strictly positive integer."
        assert temperature > 0, "`temperature` should be strictly positive."
        assert isinstance(top_k, int) and top_k >= 0, "`top_k` should be a positive integer."
        assert 0 <= top_p <= 1, "`top_p` should be between 0 and 1."
        assert repetition_penalty >= 1.0, "`repetition_penalty` should be >= 1."
        assert input_ids is not None or (
            isinstance(bos_token_id, int) and bos_token_id >= 0
        ), "If input_ids is not defined, `bos_token_id` should be a positive integer."
        assert pad_token_id is None or (
            isinstance(pad_token_id, int) and (pad_token_id >= 0)
        ), "`pad_token_id` should be a positive integer."
        assert (eos_token_id is None) or (
            isinstance(eos_token_id, int) and (eos_token_id >= 0)
        ), "`eos_token_id` should be a positive integer."
        assert length_penalty > 0, "`length_penalty` should be strictly positive."
        assert (
            isinstance(num_return_sequences, int) and num_return_sequences > 0
        ), "`num_return_sequences` should be a strictly positive integer."
        assert (
            bad_words_ids is None or isinstance(bad_words_ids, list) and isinstance(bad_words_ids[0], list)
        ), "`bad_words_ids` is either `None` or a list of lists of tokens that should not be generated"

        if input_ids is None:
            assert isinstance(bos_token_id, int) and bos_token_id >= 0, (
                "you should either supply a context to complete as `input_ids` input "
                "or a `bos_token_id` (integer >= 0) as a first token to start the generation."
            )
            input_ids = tf.fill((batch_size, 1), bos_token_id)
        else:
            assert len(shape_list(input_ids)) == 2, "Input prompt should be of shape (batch_size, sequence length)."

        # do not allow duplicate outputs when greedy decoding
        if do_sample is False:
            if num_beams == 1:
                # no_beam_search greedy generation conditions
                assert (
                    num_return_sequences == 1
                ), "Greedy decoding will always produce the same output for num_beams == 1 and num_return_sequences > 1. Please set num_return_sequences = 1"
            else:
                # beam_search greedy generation conditions
                assert (
                    num_beams >= num_return_sequences
                ), "Greedy beam search decoding cannot return more sequences than it has beams. Please set num_beams >= num_return_sequences"

        # create attention mask if necessary
        if (attention_mask is None) and (pad_token_id is not None) and (pad_token_id in input_ids.numpy()):
            attention_mask = tf.cast(tf.math.not_equal(input_ids, pad_token_id), dtype=tf.int32)
        elif attention_mask is None:
            attention_mask = tf.ones_like(input_ids)

        if pad_token_id is None and eos_token_id is not None:
            logger.warning(
                "Setting `pad_token_id` to {} (first `eos_token_id`) to generate sequence".format(eos_token_id)
            )
            pad_token_id = eos_token_id

        # current position and vocab size
        cur_len = shape_list(input_ids)[1]
        vocab_size = self.config.vocab_size

        # set effective batch size and effective batch multiplier according to do_sample
        if do_sample:
            effective_batch_size = batch_size * num_return_sequences
            effective_batch_mult = num_return_sequences
        else:
            effective_batch_size = batch_size
            effective_batch_mult = 1

        if self.config.is_encoder_decoder:
            if decoder_start_token_id is None:
                decoder_start_token_id = bos_token_id

            assert (
                decoder_start_token_id is not None
            ), "decoder_start_token_id or bos_token_id has to be defined for encoder-decoder generation"
            assert hasattr(self, "get_encoder"), "{} should have a 'get_encoder' function defined".format(self)
            assert callable(self.get_encoder), "{} should be a method".format(self.get_encoder)

            # get encoder and store encoder outputs
            encoder = self.get_encoder()
            encoder_outputs = encoder(input_ids, attention_mask=attention_mask)

        # Expand input ids if num_beams > 1 or num_return_sequences > 1
        if num_return_sequences > 1 or num_beams > 1:
            input_ids_len = shape_list(input_ids)[-1]
            input_ids = tf.broadcast_to(
                tf.expand_dims(input_ids, 1), (batch_size, effective_batch_mult * num_beams, input_ids_len)
            )
            attention_mask = tf.broadcast_to(
                tf.expand_dims(attention_mask, 1), (batch_size, effective_batch_mult * num_beams, input_ids_len)
            )
            # shape: (batch_size * num_return_sequences * num_beams, cur_len)
            input_ids = tf.reshape(input_ids, (effective_batch_size * num_beams, input_ids_len))
            attention_mask = tf.reshape(attention_mask, (effective_batch_size * num_beams, input_ids_len))

        if self.config.is_encoder_decoder:
            # create empty decoder_input_ids
            input_ids = tf.ones((effective_batch_size * num_beams, 1), dtype=tf.int32) * decoder_start_token_id
            cur_len = 1

            assert (
                batch_size == encoder_outputs[0].shape[0]
            ), f"expected encoder_outputs[0] to have 1st dimension bs={batch_size}, got {encoder_outputs[0].shape[0]} "

            # expand batch_idx to assign correct encoder output for expanded input_ids
            # (due to num_beams > 1 and num_return_sequences > 1)
            expanded_batch_idxs = tf.reshape(
                tf.repeat(tf.expand_dims(tf.range(batch_size), -1), repeats=num_beams * effective_batch_mult, axis=1),
                shape=(-1,),
            )
            # expand encoder_outputs
            encoder_outputs = (tf.gather(encoder_outputs[0], expanded_batch_idxs, axis=0), *encoder_outputs[1:])
        else:
            encoder_outputs = None
            cur_len = shape_list(input_ids)[-1]

        if num_beams > 1:
            output = self._generate_beam_search(
                input_ids,
                cur_len=cur_len,
                max_length=max_length,
                min_length=min_length,
                do_sample=do_sample,
                early_stopping=early_stopping,
                temperature=temperature,
                top_k=top_k,
                top_p=top_p,
                repetition_penalty=repetition_penalty,
                no_repeat_ngram_size=no_repeat_ngram_size,
                bad_words_ids=bad_words_ids,
                bos_token_id=bos_token_id,
                pad_token_id=pad_token_id,
                eos_token_id=eos_token_id,
                decoder_start_token_id=decoder_start_token_id,
                batch_size=effective_batch_size,
                num_return_sequences=num_return_sequences,
                length_penalty=length_penalty,
                num_beams=num_beams,
                vocab_size=vocab_size,
                encoder_outputs=encoder_outputs,
                attention_mask=attention_mask,
                use_cache=use_cache,
            )
        else:
            output = self._generate_no_beam_search(
                input_ids,
                cur_len=cur_len,
                max_length=max_length,
                min_length=min_length,
                do_sample=do_sample,
                temperature=temperature,
                top_k=top_k,
                top_p=top_p,
                repetition_penalty=repetition_penalty,
                no_repeat_ngram_size=no_repeat_ngram_size,
                bad_words_ids=bad_words_ids,
                bos_token_id=bos_token_id,
                pad_token_id=pad_token_id,
                eos_token_id=eos_token_id,
                decoder_start_token_id=decoder_start_token_id,
                batch_size=effective_batch_size,
                vocab_size=vocab_size,
                encoder_outputs=encoder_outputs,
                attention_mask=attention_mask,
                use_cache=use_cache,
            )

        return output
zTFPreTrainedModel.generatec           *   	      s  t |dddf }t |dddf | }|}||k r| j||||d}| f |}|d dddddf }| ||r|d }|	dkrt|||	}t j||}|
dkr
t|||
|}g }|D ]  | fddt	|D  qt
|t j|t jd	td
 }|dk	rht||}g }|D ]" | fddt	|D  q&t
|t j|t jd	td
 }dk	r||k rt jfddt	|D t jd	}t |||g}t
||td
 }|r|dkr|| }t|||d}t jt jj|t jdddd} nt jj|dt jd} dk	r:| | |d|   }!n| }!t |t |!dgd}dk	r|!k}"t j|t |"t j}#|d|#  ||#  }||#8 }t j|dkrq| jjdkrt j|t jt|d dft jd	gdd}|d }q4t j|}$t j|}%|$|%kr|dk	s(tdt j||%  gt jd	| }&t t |d||%g}'t !t t t 	|d||g}(t "|(|'k ||&})n|})|)S )z Generate sequences for each example without beam search (num_beams == 1).
            All returned sequence are generated independantly.
        Nr   pastr   r|   r   r   r~   c                    s   g | ]}| krd ndqS TFr   r   tokenbanned_tokens_slicer   r   
<listcomp>n  s     z>TFPreTrainedModel._generate_no_beam_search.<locals>.<listcomp>r   infc                    s   g | ]}| krd ndqS r   r   r   r   r   r   r   |  s     c                    s   g | ]}| krd ndqS r   r   r   r   r   r   r     s     )r   r   r   Znum_samplesr   )r   output_typeFzB`Pad_token_id` has to be defined if batches have different lengths)#r;   r   ry   r}   #_create_next_token_logits_penaltiesr   multiplycalc_banned_ngram_tokensappendr   set_tensor_by_indices_to_valueconvert_to_tensorr!   floatcalc_banned_bad_words_idsr   tf_top_k_top_p_filteringsqueezerandomcategoricalr   Zargmaxconcatr   r   
reduce_maxr$   r   r   r   
reduce_minrT   r   	transposewhere)*r   rA   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r|   Zunfinished_sentssent_lengthsr   model_inputsr{   next_token_logitsnext_token_logits_penaltiesbanned_tokensbanned_tokens_indices_maskis_token_logit_eos_tokeneos_token_indices_maskZ
next_tokenZtokens_to_addZeos_in_sentsZ+is_sents_unfinished_and_token_to_add_is_eosZmin_sent_lengthZmax_sent_lengthpaddingZbroad_casted_sent_lengthsZbroad_casted_rangedecodedr   )r   r   r   r   3  s    
   
  
  

     
 

 
 


    def _generate_beam_search(
        self,
        input_ids,
        cur_len,
        max_length,
        min_length,
        do_sample,
        early_stopping,
        temperature,
        top_k,
        top_p,
        repetition_penalty,
        no_repeat_ngram_size,
        bad_words_ids,
        bos_token_id,
        pad_token_id,
        eos_token_id,
        decoder_start_token_id,
        batch_size,
        num_return_sequences,
        length_penalty,
        num_beams,
        vocab_size,
        encoder_outputs,
        attention_mask,
        use_cache,
    ):
        """ Generate sequences for each example with beam search.
        """

        # generated hypotheses
        generated_hyps = [
            BeamHypotheses(num_beams, max_length, length_penalty, early_stopping=early_stopping)
            for _ in range(batch_size)
        ]

        # for greedy decoding only tokens of the first beam are considered initially,
        # to avoid sampling the exact same tokens num_beams times
        if do_sample is False:
            beam_scores_begin = tf.zeros((batch_size, 1), dtype=tf.float32)
            beam_scores_end = tf.ones((batch_size, num_beams - 1), dtype=tf.float32) * (-1e9)
            beam_scores = tf.concat([beam_scores_begin, beam_scores_end], -1)
        else:
            beam_scores = tf.zeros((batch_size, num_beams), dtype=tf.float32)

        beam_scores = tf.reshape(beam_scores, (batch_size * num_beams,))

        # cache compute states
        past = encoder_outputs

        # done sentences
        done = [False for _ in range(batch_size)]

        while cur_len < max_length:
            model_inputs = self.prepare_inputs_for_generation(
                input_ids, past=past, attention_mask=attention_mask, use_cache=use_cache
            )
            outputs = self(**model_inputs)  # (batch_size * num_beams, cur_len, vocab_size)
            next_token_logits = outputs[0][:, -1, :]  # (batch_size * num_beams, vocab_size)

            # if model has past, then set the past variable to speed up decoding
            if self._use_cache(outputs, use_cache):
                past = outputs[1]

            # repetition penalty (from CTRL paper https://arxiv.org/abs/1909.05858)
            if repetition_penalty != 1.0:
                next_token_logits_penalties = _create_next_token_logits_penalties(
                    input_ids, next_token_logits, repetition_penalty
                )
                next_token_logits = tf.math.multiply(next_token_logits, next_token_logits_penalties)

            # Temperature (higher temperature => more likely to sample low probability tokens)
            if temperature != 1.0:
                next_token_logits = next_token_logits / temperature

            # calculate log softmax score
            scores = tf.nn.log_softmax(next_token_logits, axis=-1)  # (batch_size * num_beams, vocab_size)

            # set eos token prob to zero if min_length is not reached
            if eos_token_id is not None and cur_len < min_length:
                num_batch_hypotheses = batch_size * num_beams
                is_token_logit_eos_token = tf.convert_to_tensor(
                    [True if token == eos_token_id else False for token in range(vocab_size)], dtype=tf.bool
                )
                eos_token_indices_mask = tf.broadcast_to(is_token_logit_eos_token, [num_batch_hypotheses, vocab_size])
                scores = set_tensor_by_indices_to_value(scores, eos_token_indices_mask, -float("inf"))

            if no_repeat_ngram_size > 0:
                # calculate a list of banned tokens to prevent repetitively generating the same ngrams
                num_batch_hypotheses = batch_size * num_beams
                banned_tokens = calc_banned_ngram_tokens(
                    input_ids, num_batch_hypotheses, no_repeat_ngram_size, cur_len
                )
                banned_tokens_indices_mask = []
                for banned_tokens_slice in banned_tokens:
                    banned_tokens_indices_mask.append(
                        [True if token in banned_tokens_slice else False for token in range(vocab_size)]
                    )
                scores = set_tensor_by_indices_to_value(
                    scores, tf.convert_to_tensor(banned_tokens_indices_mask, dtype=tf.bool), -float("inf")
                )

            if bad_words_ids is not None:
                # calculate a list of banned tokens according to bad words
                banned_tokens = calc_banned_bad_words_ids(input_ids, bad_words_ids)
                banned_tokens_indices_mask = []
                for banned_tokens_slice in banned_tokens:
                    banned_tokens_indices_mask.append(
                        [True if token in banned_tokens_slice else False for token in range(vocab_size)]
                    )
                scores = set_tensor_by_indices_to_value(
                    scores, tf.convert_to_tensor(banned_tokens_indices_mask, dtype=tf.bool), -float("inf")
                )

            assert shape_list(scores) == [batch_size * num_beams, vocab_size]

            if do_sample:
                _scores = scores + tf.broadcast_to(
                    beam_scores[:, None], (batch_size * num_beams, vocab_size)
                )  # (batch_size * num_beams, vocab_size)

                # Top-p/top-k filtering
                _scores = tf_top_k_top_p_filtering(
                    _scores, top_k=top_k, top_p=top_p, min_tokens_to_keep=2
                )  # (batch_size * num_beams, vocab_size)
                # re-organize to group the beams together and sample from all beam indices
                _scores = tf.reshape(_scores, (batch_size, num_beams * vocab_size))

                # Sample 2 next tokens for each beam (so we have some spare tokens
                # and match the output of greedy beam search)
                next_tokens = tf.random.categorical(
                    _scores, dtype=tf.int32, num_samples=2 * num_beams
                )  # (batch_size, 2 * num_beams)
                # Compute next scores
                next_scores = tf.gather(_scores, next_tokens, batch_dims=1)  # (batch_size, 2 * num_beams)

                # sort the sampled vector to make sure that the first num_beams samples are the best
                next_scores_indices = tf.argsort(next_scores, direction="DESCENDING", axis=1)
                next_scores = tf.gather(next_scores, next_scores_indices, batch_dims=1)
                next_tokens = tf.gather(next_tokens, next_scores_indices, batch_dims=1)
            else:
                # Add the log prob of the new beams to the log prob of the beginning of the sequence
                # (sum of logs == log of the product)
                next_scores = scores + tf.broadcast_to(
                    beam_scores[:, None], (batch_size * num_beams, vocab_size)
                )  # (batch_size * num_beams, vocab_size)

                # re-organize to group the beams together (we keep the top hypotheses across beams)
                next_scores = tf.reshape(next_scores, (batch_size, num_beams * vocab_size))

                next_scores, next_tokens = tf.math.top_k(next_scores, k=2 * num_beams, sorted=True)

            assert shape_list(next_scores) == shape_list(next_tokens) == [batch_size, 2 * num_beams]

            # next batch beam content
            next_batch_beam = []

            # for each sentence
            for batch_idx in range(batch_size):

                # if we are done with this sentence
                if done[batch_idx]:
                    assert (
                        len(generated_hyps[batch_idx]) >= num_beams
                    ), "Batch can only be done if at least {} beams have been generated".format(num_beams)
                    assert (
                        eos_token_id is not None and pad_token_id is not None
                    ), "generated beams >= num_beams -> eos_token_id and pad_token have to be defined"
                    next_batch_beam.extend([(0, pad_token_id, 0)] * num_beams)  # pad the batch
                    continue

                # next sentence beam content
                next_sent_beam = []

                # next tokens for this sentence
                for beam_token_rank, (beam_token_id, beam_token_score) in enumerate(
                    zip(next_tokens[batch_idx], next_scores[batch_idx])
                ):
                    # get beam and token IDs
                    beam_id = beam_token_id // vocab_size
                    token_id = beam_token_id % vocab_size

                    effective_beam_id = batch_idx * num_beams + beam_id
                    # add to generated hypotheses if end of sentence
                    if (eos_token_id is not None) and (token_id.numpy() == eos_token_id):
                        # if beam_token does not belong to top num_beams tokens, it should not be added
                        is_beam_token_worse_than_top_num_beams = beam_token_rank >= num_beams
                        if is_beam_token_worse_than_top_num_beams:
                            continue
                        generated_hyps[batch_idx].add(
                            tf.identity(input_ids[effective_beam_id]), beam_token_score.numpy()
                        )
                    else:
                        # add next predicted token if it is not an eos_token
                        next_sent_beam.append((beam_token_score, token_id, effective_beam_id))

                    # the beam for next step is full
                    if len(next_sent_beam) == num_beams:
                        break

                # Check if we are done, so that we can save a pad step if all(done)
                done[batch_idx] = done[batch_idx] or generated_hyps[batch_idx].is_done(
                    tf.reduce_max(next_scores[batch_idx]).numpy(), cur_len
                )

                # update next beam content
                assert len(next_sent_beam) == num_beams, "Beam should always be full"
                next_batch_beam.extend(next_sent_beam)
                assert len(next_batch_beam) == num_beams * (batch_idx + 1)

            # stop when we are done with each sentence
            if all(done):
                break

            # sanity check / prepare next batch
            assert len(next_batch_beam) == batch_size * num_beams
            beam_scores = tf.convert_to_tensor([x[0] for x in next_batch_beam], dtype=tf.float32)
            beam_tokens = tf.convert_to_tensor([x[1] for x in next_batch_beam], dtype=tf.int32)
            beam_idx = tf.convert_to_tensor([x[2] for x in next_batch_beam], dtype=tf.int32)

            # re-order batch and update current length
            input_ids = tf.stack([tf.identity(input_ids[x, :]) for x in beam_idx])
            input_ids = tf.concat([input_ids, tf.expand_dims(beam_tokens, 1)], axis=-1)
            cur_len = cur_len + 1

            # re-order internal states
            if past is not None:
                past = self._reorder_cache(past, beam_idx)

            # extend attention_mask for new generated input if only decoder
            if self.config.is_encoder_decoder is False:
                attention_mask = tf.concat(
                    [attention_mask, tf.ones((shape_list(attention_mask)[0], 1), dtype=tf.int32)], axis=-1
                )

        # finalize all open beam hypotheses and add them to generated hypotheses
        for batch_idx in range(batch_size):
            # Add all open beam hypotheses to generated_hyps
            if done[batch_idx]:
                continue

            # test that beam scores match previously calculated scores if not eos and batch_idx not done
            if eos_token_id is not None and all(
                (token_id % vocab_size).numpy().item() != eos_token_id for token_id in next_tokens[batch_idx]
            ):
                assert tf.reduce_all(
                    next_scores[batch_idx, :num_beams] == tf.reshape(beam_scores, (batch_size, num_beams))[batch_idx]
                ), "If batch_idx is not done, final next scores: {} have to equal to accumulated beam_scores: {}".format(
                    next_scores[:, :num_beams][batch_idx], tf.reshape(beam_scores, (batch_size, num_beams))[batch_idx]
                )

            # need to add best num_beams hypotheses to generated hyps
            for beam_id in range(num_beams):
                effective_beam_id = batch_idx * num_beams + beam_id
                final_score = beam_scores[effective_beam_id].numpy().item()
                final_tokens = input_ids[effective_beam_id]
                generated_hyps[batch_idx].add(final_tokens, final_score)

        # depending on whether greedy generation is wanted or not,
        # define different output_batch_size and output_num_return_sequences_per_batch
        output_batch_size = batch_size if do_sample else batch_size * num_return_sequences
        output_num_return_sequences_per_batch = 1 if do_sample else num_return_sequences

        # select the best hypotheses
        sent_lengths_list = []
        best = []

        # retrieve best hypotheses
        for i, hypotheses in enumerate(generated_hyps):
            sorted_hyps = sorted(hypotheses.beams, key=lambda x: x[0])
            for j in range(output_num_return_sequences_per_batch):
                best_hyp = sorted_hyps.pop()[1]
                sent_lengths_list.append(len(best_hyp))
                best.append(best_hyp)
        assert output_batch_size == len(best), "Output batch size {} must match output beam hypotheses {}".format(
            output_batch_size, len(best)
        )

        sent_lengths = tf.convert_to_tensor(sent_lengths_list, dtype=tf.int32)

        # shorter batches are filled with pad_token
        if tf.reduce_min(sent_lengths).numpy() != tf.reduce_max(sent_lengths).numpy():
            assert pad_token_id is not None, "`Pad_token_id` has to be defined"
            sent_max_len = min(tf.reduce_max(sent_lengths).numpy() + 1, max_length)
            decoded_list = []

            # fill with hypothesis and eos_token_id if necessary
            for i, hypo in enumerate(best):
                assert sent_lengths[i] == shape_list(hypo)[0]
                # if sent_length is max_len do not pad
                if sent_lengths[i] == sent_max_len:
                    decoded_slice = hypo
                else:
                    # else pad to sent_max_len
                    num_pad_tokens = sent_max_len - sent_lengths[i]
                    padding = pad_token_id * tf.ones((num_pad_tokens,), dtype=tf.int32)
                    decoded_slice = tf.concat([hypo, padding], axis=-1)

                    # finish sentence with EOS token
                    if sent_lengths[i] < max_length:
                        decoded_slice = tf.where(
                            tf.range(sent_max_len, dtype=tf.int32) == sent_lengths[i],
                            eos_token_id * tf.ones((sent_max_len,), dtype=tf.int32),
                            decoded_slice,
                        )
                # add to list
                decoded_list.append(decoded_slice)

            decoded = tf.stack(decoded_list)
        else:
            # none of the hypotheses have an eos_token
            assert all(len(hypo) == max_length for hypo in best)
            decoded = tf.stack(best)

        return decoded
    @staticmethod
    def _reorder_cache(past, beam_idx):
        return tuple(tf.gather(layer_past, beam_idx, axis=1) for layer_past in past)


def _create_next_token_logits_penalties(input_ids, logits, repetition_penalty):
    # create logit penalties for already seen input_ids
    token_penalties = np.ones(shape_list(logits))
    prev_input_ids = [np.unique(input_id) for input_id in input_ids.numpy()]
    for i, prev_input_id in enumerate(prev_input_ids):
        logit_penalized = logits[i].numpy()[prev_input_id]
        logit_penalties = np.zeros(logit_penalized.shape)
        # if previous logit score is < 0 then multiply repetition penalty, else divide
        logit_penalties[logit_penalized < 0] = repetition_penalty
        logit_penalties[logit_penalized > 0] = 1 / repetition_penalty
        np.put(token_penalties[i], prev_input_id, logit_penalties)
    return tf.convert_to_tensor(token_penalties, dtype=tf.float32)
def calc_banned_ngram_tokens(prev_input_ids, num_hypos, no_repeat_ngram_size, cur_len):
    # Copied from fairseq for no_repeat_ngram in beam_search
    if cur_len + 1 < no_repeat_ngram_size:
        # return no banned tokens if we haven't generated no_repeat_ngram_size tokens yet
        return [[] for _ in range(num_hypos)]
    generated_ngrams = [{} for _ in range(num_hypos)]
    for idx in range(num_hypos):
        gen_tokens = prev_input_ids[idx].numpy().tolist()
        generated_ngram = generated_ngrams[idx]
        for ngram in zip(*[gen_tokens[i:] for i in range(no_repeat_ngram_size)]):
            prev_ngram_tuple = tuple(ngram[:-1])
            generated_ngram[prev_ngram_tuple] = generated_ngram.get(prev_ngram_tuple, []) + [ngram[-1]]

    def _get_generated_ngrams(hypo_idx):
        # Before decoding the next token, prevent decoding of ngrams that have already appeared
        start_idx = cur_len + 1 - no_repeat_ngram_size
        ngram_idx = tuple(prev_input_ids[hypo_idx, start_idx:cur_len].numpy().tolist())
        return generated_ngrams[hypo_idx].get(ngram_idx, [])

    banned_tokens = [_get_generated_ngrams(hypo_idx) for hypo_idx in range(num_hypos)]
    return banned_tokens


def calc_banned_bad_words_ids(prev_input_ids, bad_words_ids):
    banned_tokens = []

    def _tokens_match(prev_tokens, tokens):
        if len(tokens) == 0:
            # if bad word tokens is just one token always ban it
            return True
        if len(tokens) > len(prev_tokens):
            # if bad word tokens are longer than prev tokens they can't be equal
            return False
        if prev_tokens[-len(tokens):] == tokens:
            # if tokens match
            return True
        else:
            return False

    for prev_input_ids_slice in prev_input_ids:
        banned_tokens_slice = []

        for banned_token_seq in bad_words_ids:
            assert len(banned_token_seq) > 0, "Banned words token sequences {} cannot have an empty list".format(
                bad_words_ids
            )

            if _tokens_match(prev_input_ids_slice.numpy().tolist(), banned_token_seq[:-1]) is False:
                # if tokens do not match continue
                continue

            banned_tokens_slice.append(banned_token_seq[-1])

        banned_tokens.append(banned_tokens_slice)

    return banned_tokens


def tf_top_k_top_p_filtering(logits, top_k=0, top_p=1.0, filter_value=-float("Inf"), min_tokens_to_keep=1):
    """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
        Args:
            logits: logits distribution shape (batch size, vocabulary size)
            if top_k > 0: keep only top k tokens with highest probability (top-k filtering).
            if top_p < 1.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
                Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
            Make sure we keep at least min_tokens_to_keep per batch example in the output
        From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
    """
    logits_shape = shape_list(logits)

    if top_k > 0:
        top_k = min(max(top_k, min_tokens_to_keep), logits_shape[-1])  # Safety check
        # Remove all tokens with a probability less than the last token of the top-k
        indices_to_remove = logits < tf.math.top_k(logits, k=top_k)[0][..., -1, None]
        logits = set_tensor_by_indices_to_value(logits, indices_to_remove, filter_value)

    if top_p < 1.0:
        sorted_indices = tf.argsort(logits, direction="DESCENDING")
        sorted_logits = tf.gather(
            logits, sorted_indices, axis=-1, batch_dims=1
        )  # expects logits to be of dim (batch_size, vocab_size)

        cumulative_probs = tf.math.cumsum(tf.nn.softmax(sorted_logits, axis=-1), axis=-1)

        # Remove tokens with cumulative probability above the threshold (token with 0 are kept)
        sorted_indices_to_remove = cumulative_probs > top_p

        if min_tokens_to_keep > 1:
            # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below)
            sorted_indices_to_remove = tf.concat(
                [
                    tf.zeros_like(sorted_indices_to_remove[:, :min_tokens_to_keep]),
                    sorted_indices_to_remove[:, min_tokens_to_keep:],
                ],
                -1,
            )

        # Shift the indices to the right to keep also the first token above the threshold
        sorted_indices_to_remove = tf.roll(sorted_indices_to_remove, 1, axis=-1)
        sorted_indices_to_remove = tf.concat(
            [tf.zeros_like(sorted_indices_to_remove[:, :1]), sorted_indices_to_remove[:, 1:]], -1
        )
        # scatter sorted tensors back to the original indexing
        indices_to_remove = scatter_values_on_batch_indices(sorted_indices_to_remove, sorted_indices)
        logits = set_tensor_by_indices_to_value(logits, indices_to_remove, filter_value)
    return logits
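# Illustrative sketches for the helpers above (toy-sized inputs; not part of the API):
#
# n-gram banning: with no_repeat_ngram_size=2, a hypothesis that already produced the
# bigram (5, 7) and now ends in 5 gets 7 banned for the next step:
#
#     >>> ids = tf.constant([[5, 7, 5]])
#     >>> calc_banned_ngram_tokens(ids, num_hypos=1, no_repeat_ngram_size=2, cur_len=3)
#     [[7]]
#
# top-k filtering: all but the k highest-scoring tokens per row are set to filter_value
# (-inf by default):
#
#     >>> logits = tf.constant([[1.0, 2.0, 3.0, 4.0]])
#     >>> tf_top_k_top_p_filtering(logits, top_k=2)
#     <tf.Tensor: ... numpy=array([[-inf, -inf, 3., 4.]], dtype=float32)>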
def scatter_values_on_batch_indices(values, batch_indices):
    shape = shape_list(batch_indices)
    # broadcast batch dim to shape
    broad_casted_batch_dims = tf.reshape(
        tf.broadcast_to(tf.expand_dims(tf.range(shape[0]), axis=-1), shape), [1, -1]
    )
    # transform batch_indices to pair_indices
    pair_indices = tf.transpose(tf.concat([broad_casted_batch_dims, tf.reshape(batch_indices, [1, -1])], 0))
    # scatter values to pair indices
    return tf.scatter_nd(pair_indices, tf.reshape(values, [-1]), shape)


def set_tensor_by_indices_to_value(tensor, indices, value):
    # create value_tensor since tensor value assignment is not possible in TF
    value_tensor = tf.zeros_like(tensor) + value
    return tf.where(indices, value_tensor, tensor)
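# Sketch of the masking helper above (illustrative):
#
#     >>> t = tf.constant([1.0, 2.0, 3.0])
#     >>> mask = tf.constant([True, False, True])
#     >>> set_tensor_by_indices_to_value(t, mask, 0.0)
#     <tf.Tensor: ... numpy=array([0., 2., 0.], dtype=float32)>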
dd	ZdS )r   c                 C   s,   |d | _ || _|| _|| _g | _d| _dS )z7
        Initialize n-best list of hypotheses.
        r   g    eAN)r   r   r   r   r   worst_score)r   r   r   r   r   r   r   r   r4     s    
zBeamHypotheses.__init__c                 C   s
   t | jS )z3
        Number of hypotheses in the list.
        )rt   r   rB   r   r   r   __len__  s    zBeamHypotheses.__len__c                 C   s   |t || j  }t | | jk s*|| jkr| j||f t | | jkrtdd t| jD }| j|d d = |d d | _nt|| j| _dS )z3
        Add a new hypothesis to the list.
        c                 S   s   g | ]\}\}}||fqS r   r   )r   r  sr   r   r   r   r     s    
 z&BeamHypotheses.add.<locals>.<listcomp>r   r   N)	rt   r   r   r  r   r   r   r   r   )r   ZhypZsum_logprobsZscoreZsorted_scoresr   r   r   r     s    zBeamHypotheses.addNc                 C   sJ   t | | jk rdS | jrdS |dkr*| j}||| j  }| j|k}|S dS )z
        If there are enough hypotheses and that none of the hypotheses being generated
        can become better than the worst one in the heap, then we are done with this sentence.
        FTN)rt   r   r   r   r   r  )r   Zbest_sum_logprobsr   Z	cur_scoreretr   r   r   r     s    
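# Bookkeeping sketch (illustrative): scores are length-normalized on `add`, and the
# worst hypothesis is evicted once the list holds more than `num_beams` entries:
#
#     >>> hyps = BeamHypotheses(num_beams=2, max_length=10, length_penalty=1.0, early_stopping=False)
#     >>> hyps.add(tf.constant([0, 1, 2]), sum_logprobs=-3.0)  # score -1.0
#     >>> hyps.add(tf.constant([0, 1]), sum_logprobs=-1.0)     # score -0.5
#     >>> len(hyps)
#     2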
zBeamHypotheses.is_done)N)r   r   r   r4   r  r   r   r   r   r   r   r     s   r   c                       s.   e Zd Zd fdd	Zdd Zdd Z  ZS )	TFConv1D{Gz?c                    s$   t  jf | || _|| _|| _dS )z TFConv1D layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2)
            Basically works like a Linear layer but the weights are transposed
        N)r0   r4   nfnxinitializer_range)r   r!  r"  r#  r,   rG   r   r   r4     s    zTFConv1D.__init__c                 C   s@   | j d| j| jgt| jd| _| j dd| jgt d| _d S )Nweightr   r-   biasr   )	
add_weightr"  r!  get_initializerr#  r$  r;   Zzeros_initializerr&  r   Zinput_shaper   r   r   build  s     
 zTFConv1D.buildc                 C   sR   t |d d \}}t|d| jg}t|| j| j }t|||| jg}|S )Nr   r   )r   r;   r   r"  matmulr$  r&  r!  )r   r   Zbzslr   r   r   call  s
    zTFConv1D.call)r   )r   r   r   r4   r*  r-  r  r   r   rG   r   r    s   	r  c                       sH   e Zd ZdZd fdd	Z fddZddd	Zd
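# Shape sketch (illustrative): TFConv1D maps (batch, seq, nx) -> (batch, seq, nf):
#
#     >>> layer = TFConv1D(nf=8, nx=4)
#     >>> layer(tf.ones((2, 5, 4))).shape
#     TensorShape([2, 5, 8])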
d Zdd Z  Z	S )TFSharedEmbeddingsz'Construct shared token embeddings.
    Nc                    s4   t  jf | || _|| _|d kr*|d n|| _d S )Ng      )r0   r4   r   hidden_sizer#  )r   r   r/  r#  r,   rG   r   r   r4     s    zTFSharedEmbeddings.__init__c                    s0   | j d| j| jgt| jd| _t | dS )zBuild shared token embedding layer
        Shared weights logic adapted from
            https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
        r$  r%  N)r'  r   r/  r(  r#  r$  r0   r*  r)  rG   r   r   r*    s     
 zTFSharedEmbeddings.build	embeddingc                 C   s6   |dkr|  |S |dkr$| |S td|dS )a	  Get token embeddings of inputs.
        Args:
            inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids)
            mode: string, a valid value is one of "embedding" and "linear".
        Returns:
            outputs: (1) If mode == "embedding", output embedding tensor, float32 with
                shape [batch_size, length, embedding_size]; (2) mode == "linear", output
                linear tensor, float32 with shape [batch_size, length, vocab_size].
        Raises:
            ValueError: if mode is not valid.

        Shared weights logic adapted from
            https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
        r0  Zlinearzmode {} is not valid.N)
_embedding_linearr(   rD   )r   rF   moder   r   r   r-    s
    

zTFSharedEmbeddings.callc                 C   s   t | j|S )z)Applies embedding based on inputs tensor.)r;   r   r$  )r   rA   r   r   r   r1    s    zTFSharedEmbeddings._embeddingc                 C   sH   t |dd }t|d| jg}tj|| jdd}t||| jg S )zComputes logits by running inputs through a linear layer.
            Args:
                inputs: A float32 tensor with shape [..., hidden_size]
            Returns:
                float32 tensor with shape [..., vocab_size].
        Nr   T)Ztranspose_b)r   r;   r   r/  r+  r$  r   )r   rF   Z
first_dimsr   r
  r   r   r   r2  "  s    zTFSharedEmbeddings._linear)N)r0  )
r   r   r   r    r4   r*  r-  r1  r2  r  r   r   rG   r   r.    s   

r.  c                       s,   e Zd ZdZd fdd	Zd	ddZ  ZS )
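# Weight-tying sketch (illustrative): the same variable serves as the input embedding
# ("embedding" mode) and as the output projection ("linear" mode):
#
#     >>> emb = TFSharedEmbeddings(vocab_size=100, hidden_size=16)
#     >>> hidden = emb(tf.constant([[1, 2, 3]]))   # shape (1, 3, 16)
#     >>> logits = emb(hidden, mode="linear")      # shape (1, 3, 100)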
class TFSequenceSummary(tf.keras.layers.Layer):
    r""" Compute a single vector summary of a sequence hidden states according to various possibilities:
        Args of the config class:
            summary_type:
                - 'last' => [default] take the last token hidden state (like XLNet)
                - 'first' => take the first token hidden state (like Bert)
                - 'mean' => take the mean of all tokens hidden states
                - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
                - 'attn' => Not implemented now, use multi-head attention
            summary_use_proj: Add a projection after the vector extraction
            summary_proj_to_labels: If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
            summary_activation: 'tanh' => add a tanh activation to the output, Other => no activation. Default
            summary_first_dropout: Add a dropout before the projection and activation
            summary_last_dropout: Add a dropout after the projection and activation
    """

    def __init__(self, config, initializer_range=0.02, **kwargs):
        super().__init__(**kwargs)

        self.summary_type = config.summary_type if hasattr(config, "summary_use_proj") else "last"
        if self.summary_type == "attn":
            raise NotImplementedError

        self.has_summary = hasattr(config, "summary_use_proj") and config.summary_use_proj
        if self.has_summary:
            if hasattr(config, "summary_proj_to_labels") and config.summary_proj_to_labels and config.num_labels > 0:
                num_classes = config.num_labels
            else:
                num_classes = config.hidden_size
            self.summary = tf.keras.layers.Dense(
                num_classes, kernel_initializer=get_initializer(initializer_range), name="summary"
            )

        self.has_activation = hasattr(config, "summary_activation") and config.summary_activation == "tanh"
        if self.has_activation:
            self.activation = tf.keras.activations.tanh

        self.has_first_dropout = hasattr(config, "summary_first_dropout") and config.summary_first_dropout > 0
        if self.has_first_dropout:
            self.first_dropout = tf.keras.layers.Dropout(config.summary_first_dropout)

        self.has_last_dropout = hasattr(config, "summary_last_dropout") and config.summary_last_dropout > 0
        if self.has_last_dropout:
            self.last_dropout = tf.keras.layers.Dropout(config.summary_last_dropout)

    def call(self, inputs, training=False):
        """ hidden_states: float Tensor in shape [bsz, seq_len, hidden_size], the hidden-states of the last layer.
            cls_index: [optional] position of the classification token if summary_type == 'cls_index',
                shape (bsz,) or more generally (bsz, ...) where ... are optional leading dimensions of hidden_states.
                if summary_type == 'cls_index' and cls_index is None:
                    we take the last token of the sequence as classification token
        """
        if not isinstance(inputs, (dict, tuple, list)):
            hidden_states = inputs
            cls_index = None
        elif isinstance(inputs, (tuple, list)):
            hidden_states = inputs[0]
            cls_index = inputs[1] if len(inputs) > 1 else None
            assert len(inputs) <= 2, "Too many inputs."
        else:
            hidden_states = inputs.get("hidden_states")
            cls_index = inputs.get("cls_index", None)

        if self.summary_type == "last":
            output = hidden_states[:, -1]
        elif self.summary_type == "first":
            output = hidden_states[:, 0]
        elif self.summary_type == "mean":
            output = tf.reduce_mean(hidden_states, axis=1)
        elif self.summary_type == "cls_index":
            hidden_shape = shape_list(hidden_states)  # e.g. [batch, num choices, seq length, hidden dims]
            if cls_index is None:
                # A tensor of shape [batch] or [batch, num choices], full of sequence length
                cls_index = tf.fill(hidden_shape[:-2], hidden_shape[-2] - 1)
            cls_shape = shape_list(cls_index)
            if len(cls_shape) <= len(hidden_shape) - 2:
                cls_index = cls_index[..., tf.newaxis]
            output = tf.gather(hidden_states, cls_index, batch_dims=len(hidden_shape) - 2)
            # shape of output: (batch, num choices, hidden_size)
            output = tf.squeeze(output, axis=len(hidden_shape) - 2)
        elif self.summary_type == "attn":
            raise NotImplementedError

        if self.has_first_dropout:
            output = self.first_dropout(output, training=training)

        if self.has_summary:
            output = self.summary(output)

        if self.has_activation:
            output = self.activation(output)

        if self.has_last_dropout:
            output = self.last_dropout(output, training=training)

        return output


def shape_list(x):
    """Deal with dynamic shape in tensorflow cleanly."""
    static = x.shape.as_list()
    dynamic = tf.shape(x)
    return [dynamic[i] if s is None else s for i, s in enumerate(static)]
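# Why shape_list (illustrative): inside a tf.function, `x.shape` may contain `None`
# entries; shape_list substitutes the dynamic `tf.shape(x)` value for those dimensions
# only, so fully static shapes come back as plain Python ints:
#
#     >>> shape_list(tf.ones((2, 3)))
#     [2, 3]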
def get_initializer(initializer_range=0.02):
    """Creates a `tf.initializers.truncated_normal` with the given range.
    Args:
        initializer_range: float, initializer range for stddev.
    Returns:
        TruncatedNormal initializer with stddev = `initializer_range`.
    """
    return tf.keras.initializers.TruncatedNormal(stddev=initializer_range)
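# Usage sketch (illustrative): layers in this file create their weights with this
# truncated-normal initializer, e.g.:
#
#     >>> dense = tf.keras.layers.Dense(10, kernel_initializer=get_initializer(0.02))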