""" OpenAI GPT-2 configuration """

import logging

from .configuration_utils import PretrainedConfig


logger = logging.getLogger(__name__)

# Mapping from model shortcut names to the URLs of their configuration files.
GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json",
    "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-config.json",
    "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-config.json",
    "gpt2-xl": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-config.json",
    "distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-config.json",
}


class GPT2Config(PretrainedConfig):
    """
        This is the configuration class to store the configuration of a :class:`~transformers.GPT2Model`.
        It is used to instantiate a GPT-2 model according to the specified arguments, defining the model
        architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
        the GPT-2 `small <https://huggingface.co/gpt2>`__ architecture.

        Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
        to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
        for more information.


        Args:
            vocab_size (:obj:`int`, optional, defaults to 50257):
                Vocabulary size of the GPT-2 model. Defines the number of different tokens that
                can be represented by the `input_ids` passed to the forward method of :class:`~transformers.GPT2Model`.
            n_positions (:obj:`int`, optional, defaults to 1024):
                The maximum sequence length that this model might ever be used with.
                Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
            n_ctx (:obj:`int`, optional, defaults to 1024):
                Dimensionality of the causal mask (usually same as n_positions).
            n_embd (:obj:`int`, optional, defaults to 768):
                Dimensionality of the embeddings and hidden states.
            n_layer (:obj:`int`, optional, defaults to 12):
                Number of hidden layers in the Transformer encoder.
            n_head (:obj:`int`, optional, defaults to 12):
                Number of attention heads for each attention layer in the Transformer encoder.
            activation_function (:obj:`str`, optional, defaults to 'gelu_new'):
                Activation function selected in the list ["relu", "swish", "gelu", "tanh", "gelu_new"].
            resid_pdrop (:obj:`float`, optional, defaults to 0.1):
                The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
            embd_pdrop (:obj:`float`, optional, defaults to 0.1):
                The dropout ratio for the embeddings.
            attn_pdrop (:obj:`float`, optional, defaults to 0.1):
                The dropout ratio for the attention.
            layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5):
                The epsilon to use in the layer normalization layers.
            initializer_range (:obj:`float`, optional, defaults to 0.02):
                The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
            summary_type (:obj:`string`, optional, defaults to "cls_index"):
                Argument used when doing sequence summary. Used in the multiple choice head in
                :class:`~transformers.GPT2DoubleHeadsModel`.
                Is one of the following options:

                - 'last' => take the last token hidden state (like XLNet)
                - 'first' => take the first token hidden state (like Bert)
                - 'mean' => take the mean of all tokens' hidden states
                - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
                - 'attn' => Not implemented now, use multi-head attention
            summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
                Argument used when doing sequence summary. Used in the multiple choice head in
                :class:`~transformers.GPT2DoubleHeadsModel`.
                Whether to add a projection after the vector extraction.
            summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
                Argument used when doing sequence summary. Used in the multiple choice head in
                :class:`~transformers.GPT2DoubleHeadsModel`.
                'tanh' => add a tanh activation to the output; any other value => no activation.
            summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
                Argument used when doing sequence summary. Used in the multiple choice head in
                :class:`~transformers.GPT2DoubleHeadsModel`.
                If True, the projection outputs to config.num_labels classes (otherwise to config.hidden_size).
            summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
                Argument used when doing sequence summary. Used in the multiple choice head in
                :class:`~transformers.GPT2DoubleHeadsModel`.
                The dropout probability to apply before the projection and activation.

        Example::

            from transformers import GPT2Model, GPT2Config

            # Initializing a GPT2 configuration
            configuration = GPT2Config()

            # Initializing a model from the configuration
            model = GPT2Model(configuration)

            # Accessing the model configuration
            configuration = model.config
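
            # A small extension of the example above (the override values are
            # illustrative only): the aliased properties mirror the GPT-2
            # specific attribute names.
            custom_configuration = GPT2Config(n_embd=256, n_layer=6, n_head=8)
            assert custom_configuration.hidden_size == custom_configuration.n_embd
            assert custom_configuration.num_hidden_layers == custom_configuration.n_layer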

        Attributes:
            pretrained_config_archive_map (Dict[str, str]):
                A dictionary containing all the available pre-trained checkpoints.
    """

    model_type = "gpt2"
    pretrained_config_archive_map = GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP

    def __init__(
        self,
        vocab_size=50257,
        n_positions=1024,
        n_ctx=1024,
        n_embd=768,
        n_layer=12,
        n_head=12,
        activation_function="gelu_new",
        resid_pdrop=0.1,
        embd_pdrop=0.1,
        attn_pdrop=0.1,
        layer_norm_epsilon=1e-5,
        initializer_range=0.02,
        summary_type="cls_index",
        summary_use_proj=True,
        summary_activation=None,
        summary_proj_to_labels=True,
        summary_first_dropout=0.1,
        bos_token_id=50256,
        eos_token_id=50256,
        **kwargs
    ):
        super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)

        self.vocab_size = vocab_size
        self.n_ctx = n_ctx
        self.n_positions = n_positions
        self.n_embd = n_embd
        self.n_layer = n_layer
        self.n_head = n_head
        self.activation_function = activation_function
        self.resid_pdrop = resid_pdrop
        self.embd_pdrop = embd_pdrop
        self.attn_pdrop = attn_pdrop
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_range = initializer_range
        self.summary_type = summary_type
        self.summary_use_proj = summary_use_proj
        self.summary_activation = summary_activation
        self.summary_first_dropout = summary_first_dropout
        self.summary_proj_to_labels = summary_proj_to_labels
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id

    # The properties below alias the GPT-2 specific attribute names to the
    # generic names shared by the other configuration classes.
    @property
    def max_position_embeddings(self):
        return self.n_positions

    @property
    def hidden_size(self):
        return self.n_embd

    @property
    def num_attention_heads(self):
        return self.n_head

    @property
    def num_hidden_layers(self):
        return self.n_layer
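

# Minimal usage sketch, kept under a __main__ guard so that importing this
# module is unaffected; it assumes only helpers inherited from
# PretrainedConfig, such as ``to_json_string``. Because of the relative
# import above, run it as ``python -m transformers.configuration_gpt2``.
if __name__ == "__main__":
    # Default construction reproduces the GPT-2 small hyper-parameters.
    config = GPT2Config()
    print(config.n_layer, config.n_head, config.n_embd)  # 12 12 768

    # The aliased properties simply mirror the GPT-2 specific names.
    assert config.max_position_embeddings == config.n_positions

    # Serialize the settings to JSON, e.g. for inspection or versioning.
    print(config.to_json_string())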