""" OpenAI GPT configuration """


import logging

from .configuration_utils import PretrainedConfig


logger = logging.getLogger(__name__)

OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-config.json"
}


class OpenAIGPTConfig(PretrainedConfig):
    """
        This is the configuration class to store the configuration of an :class:`~transformers.OpenAIGPTModel`.
        It is used to instantiate a GPT model according to the specified arguments, defining the model
        architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
        the `GPT <https://huggingface.co/openai-gpt>`__ architecture from OpenAI.

        Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
        to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
        for more information.

        Args:
            vocab_size (:obj:`int`, optional, defaults to 40478):
                Vocabulary size of the GPT model. Defines the number of different tokens that
                can be represented by the :obj:`input_ids` passed to the forward method of :class:`~transformers.OpenAIGPTModel`.
            n_positions (:obj:`int`, optional, defaults to 512):
                The maximum sequence length that this model might ever be used with.
                Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
            n_ctx (:obj:`int`, optional, defaults to 512):
                Dimensionality of the causal mask (usually same as n_positions).
            n_embd (:obj:`int`, optional, defaults to 768):
                Dimensionality of the embeddings and hidden states.
            n_layer (:obj:`int`, optional, defaults to 12):
                Number of hidden layers in the Transformer encoder.
            n_head (:obj:`int`, optional, defaults to 12):
                Number of attention heads for each attention layer in the Transformer encoder.
            afn (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
                The non-linear activation function (function or string) in the encoder and pooler.
                If string, "gelu", "relu", "swish" and "gelu_new" are supported.
            resid_pdrop (:obj:`float`, optional, defaults to 0.1):
                The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
            embd_pdrop (:obj:`float`, optional, defaults to 0.1):
                The dropout ratio for the embeddings.
            attn_pdrop (:obj:`float`, optional, defaults to 0.1):
                The dropout ratio for the attention.
            layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5):
                The epsilon to use in the layer normalization layers.
            initializer_range (:obj:`float`, optional, defaults to 0.02):
                The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
            predict_special_tokens (:obj:`boolean`, optional, defaults to :obj:`True`):
                Whether special tokens should be predicted when the model has a language modeling head.
            summary_type (:obj:`string`, optional, defaults to "cls_index"):
                Argument used when doing sequence summary. Used in the multiple choice head in
                :class:`~transformers.OpenAIGPTDoubleHeadsModel`.
                Must be one of the following options:

                - 'last' => take the last token hidden state (like XLNet)
                - 'first' => take the first token hidden state (like Bert)
                - 'mean' => take the mean of all tokens hidden states
                - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
                - 'attn' => Not implemented now, use multi-head attention
            summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
                Argument used when doing sequence summary. Used in the multiple choice head in
                :class:`~transformers.OpenAIGPTDoubleHeadsModel`.
                Whether to add a projection after the vector extraction.
            summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
                Argument used when doing sequence summary. Used in the multiple choice head in
                :class:`~transformers.OpenAIGPTDoubleHeadsModel`.
                'tanh' => add a tanh activation to the output, any other value => no activation.
            summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
                Argument used when doing sequence summary. Used in the multiple choice head in
                :class:`~transformers.OpenAIGPTDoubleHeadsModel`.
                If :obj:`True`, the projection outputs to :obj:`config.num_labels` classes (otherwise to :obj:`config.hidden_size`).
            summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
                Argument used when doing sequence summary. Used in the multiple choice head in
                :class:`~transformers.OpenAIGPTDoubleHeadsModel`.
                Whether to add a dropout before the projection and activation.

        Example::

            from transformers import OpenAIGPTConfig, OpenAIGPTModel

            # Initializing a GPT configuration
            configuration = OpenAIGPTConfig()

            # Initializing a model from the configuration
            model = OpenAIGPTModel(configuration)

            # Accessing the model configuration
            configuration = model.config
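
            # The defaults can be overridden at instantiation time; the sizes
            # below are illustrative values only, not a released checkpoint
            small_configuration = OpenAIGPTConfig(n_embd=256, n_layer=6, n_head=8)

            # Generic attribute names are aliased to the GPT-specific ones
            # by the properties defined on this class
            assert small_configuration.hidden_size == small_configuration.n_embd
            assert small_configuration.num_hidden_layers == small_configuration.n_layer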

        Attributes:
            pretrained_config_archive_map (Dict[str, str]):
                A dictionary containing all the available pre-trained checkpoints.
    """

    pretrained_config_archive_map = OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP
    model_type = "openai-gpt"

    def __init__(
        self,
        vocab_size=40478,
        n_positions=512,
        n_ctx=512,
        n_embd=768,
        n_layer=12,
        n_head=12,
        afn="gelu",
        resid_pdrop=0.1,
        embd_pdrop=0.1,
        attn_pdrop=0.1,
        layer_norm_epsilon=1e-5,
        initializer_range=0.02,
        predict_special_tokens=True,
        summary_type="cls_index",
        summary_use_proj=True,
        summary_activation=None,
        summary_proj_to_labels=True,
        summary_first_dropout=0.1,
        **kwargs
    ):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.n_ctx = n_ctx
        self.n_positions = n_positions
        self.n_embd = n_embd
        self.n_layer = n_layer
        self.n_head = n_head
        self.afn = afn
        self.resid_pdrop = resid_pdrop
        self.embd_pdrop = embd_pdrop
        self.attn_pdrop = attn_pdrop
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_range = initializer_range
        self.predict_special_tokens = predict_special_tokens
        self.summary_type = summary_type
        self.summary_use_proj = summary_use_proj
        self.summary_activation = summary_activation
        self.summary_first_dropout = summary_first_dropout
        self.summary_proj_to_labels = summary_proj_to_labels

    @property
    def max_position_embeddings(self):
        return self.n_positions

    @property
    def hidden_size(self):
        return self.n_embd

    @property
    def num_attention_heads(self):
        return self.n_head

    @property
    def num_hidden_layers(self):
        return self.n_layer