""" Transformer XL configuration """


import logging

from .configuration_utils import PretrainedConfig


logger = logging.getLogger(__name__)

TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "transfo-xl-wt103": "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-config.json"
}


class TransfoXLConfig(PretrainedConfig):
    """
        This is the configuration class to store the configuration of a :class:`~transformers.TransfoXLModel`.
        It is used to instantiate a Transformer XL model according to the specified arguments, defining the model
        architecture. Instantiating a configuration with the defaults will yield a configuration similar to that
        of the `Transformer XL <https://huggingface.co/transfo-xl-wt103>`__ architecture.

        Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
        to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
        for more information.
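
        Because configuration objects inherit from :class:`~transformers.PretrainedConfig`, the usual loading and
        saving helpers apply to this class as well. A minimal sketch (the save directory below is purely
        illustrative)::

            import os

            from transformers import TransfoXLConfig

            configuration = TransfoXLConfig.from_pretrained("transfo-xl-wt103")
            os.makedirs("./my-transfo-xl-config", exist_ok=True)
            configuration.save_pretrained("./my-transfo-xl-config")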

        Args:
            vocab_size (:obj:`int`, optional, defaults to 267735):
                Vocabulary size of the Transformer XL model. Defines the different tokens that
                can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.TransfoXLModel`.
            cutoffs (:obj:`List[int]`, optional, defaults to :obj:`[20000, 40000, 200000]`):
                Cutoffs for the adaptive softmax
            d_model (:obj:`int`, optional, defaults to 1024):
                Dimensionality of the model's hidden states.
            d_embed (:obj:`int`, optional, defaults to 1024):
                Dimensionality of the embeddings
            n_head (:obj:`int`, optional, defaults to 16):
                Number of attention heads for each attention layer in the Transformer encoder.
            d_head (:obj:`int`, optional, defaults to 64):
                Dimensionality of the model's heads.
            d_inner (:obj:`int`, optional, defaults to 4096):
                Inner dimension of the feed-forward (FF) layers.
            div_val (:obj:`int`, optional, defaults to 4):
                Divisor value for the adaptive input and softmax; a sketch of the resulting embedding sizes follows this argument list.
            pre_lnorm (:obj:`boolean`, optional, defaults to :obj:`False`):
                Apply LayerNorm to the input instead of the output
            n_layer (:obj:`int`, optional, defaults to 18):
                Number of hidden layers in the Transformer encoder.
            tgt_len (:obj:`int`, optional, defaults to 128):
                Number of tokens to predict
            ext_len (:obj:`int`, optional, defaults to 0):
                Length of the extended context
            mem_len (:obj:`int`, optional, defaults to 1600):
                Length of the retained previous hidden states (the memory).
            clamp_len (:obj:`int`, optional, defaults to 1000):
                Use the same positional embeddings after clamp_len.
            same_length (:obj:`boolean`, optional, defaults to :obj:`True`):
                Use the same attention length for all tokens.
            proj_share_all_but_first (:obj:`boolean`, optional, defaults to :obj:`True`):
                If :obj:`True`, share all but the first projection layers; if :obj:`False`, share none.
            attn_type (:obj:`int`, optional, defaults to 0):
                Attention type. 0 for Transformer-XL, 1 for Shaw et al., 2 for Vaswani et al., 3 for Al Rfou et al.
            sample_softmax (:obj:`int`, optional, defaults to -1):
                number of samples in sampled softmax
            adaptive (:obj:`boolean`, optional, defaults to :obj:`True`):
                use adaptive softmax
            tie_weight (:obj:`boolean`, optional, defaults to :obj:`True`):
                tie the word embedding and softmax weights
            dropout (:obj:`float`, optional, defaults to 0.1):
                The dropout probability for all fully connected layers in the embeddings and encoder.
            dropatt (:obj:`float`, optional, defaults to 0):
                The dropout ratio for the attention probabilities.
            untie_r (:obj:`boolean`, optional, defaults to :obj:`True`):
                Untie relative position biases
            init (:obj:`string`, optional, defaults to `normal`):
                Parameter initializer to use
            init_range (:obj:`float`, optional, defaults to 0.01):
                Parameters initialized by U(-init_range, init_range).
            proj_init_std (:obj:`float`, optional, defaults to 0.01):
                Projection parameters initialized by N(0, proj_init_std).
            init_std (:obj:`float`, optional, defaults to 0.02):
                Parameters initialized by N(0, init_std)
            layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5):
                The epsilon to use in the layer normalization layers
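
        The adaptive input and softmax split the vocabulary into clusters at the ``cutoffs`` boundaries, and
        ``div_val`` shrinks the embedding size of each successive cluster. A minimal sketch of the per-cluster
        embedding dimensions implied by the defaults (illustrative only, not part of the configuration API)::

            cutoffs = [20000, 40000, 200000]
            d_embed, div_val = 1024, 4
            # cluster i is embedded with dimensionality d_embed // (div_val ** i)
            for i in range(len(cutoffs) + 1):
                print(i, d_embed // (div_val ** i))  # 0 1024, 1 256, 2 64, 3 16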

        Example::

            from transformers import TransfoXLConfig, TransfoXLModel

            # Initializing a Transformer XL configuration
            configuration = TransfoXLConfig()

            # Initializing a model from the configuration
            model = TransfoXLModel(configuration)

            # Accessing the model configuration
            configuration = model.config
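
            # Defaults can also be overridden at construction time; the values below are
            # purely illustrative, not recommended settings.
            custom_configuration = TransfoXLConfig(mem_len=800, clamp_len=400)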

        Attributes:
            pretrained_config_archive_map (Dict[str, str]):
                A dictionary containing all the available pre-trained checkpoints.
    """

    pretrained_config_archive_map = TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP
    model_type = "transfo-xl"

    def __init__(
        self,
        vocab_size=267735,
        cutoffs=[20000, 40000, 200000],
        d_model=1024,
        d_embed=1024,
        n_head=16,
        d_head=64,
        d_inner=4096,
        div_val=4,
        pre_lnorm=False,
        n_layer=18,
        tgt_len=128,
        ext_len=0,
        mem_len=1600,
        clamp_len=1000,
        same_length=True,
        proj_share_all_but_first=True,
        attn_type=0,
        sample_softmax=-1,
        adaptive=True,
        tie_weight=True,
        dropout=0.1,
        dropatt=0.0,
        untie_r=True,
        init="normal",
        init_range=0.01,
        proj_init_std=0.01,
        init_std=0.02,
        layer_norm_epsilon=1e-5,
        eos_token_id=0,
        **kwargs
    ):
        super().__init__(eos_token_id=eos_token_id, **kwargs)

        self.vocab_size = vocab_size
        self.cutoffs = []
        self.cutoffs.extend(cutoffs)
        self.tie_weight = tie_weight
        # Share the projections of all adaptive-embedding clusters except the first one.
        if proj_share_all_but_first:
            self.tie_projs = [False] + [True] * len(self.cutoffs)
        else:
            self.tie_projs = [False] + [False] * len(self.cutoffs)
        self.d_model = d_model
        self.d_embed = d_embed
        self.d_head = d_head
        self.d_inner = d_inner
        self.div_val = div_val
        self.pre_lnorm = pre_lnorm
        self.n_layer = n_layer
        self.n_head = n_head
        self.tgt_len = tgt_len
        self.ext_len = ext_len
        self.mem_len = mem_len
        self.same_length = same_length
        self.attn_type = attn_type
        self.clamp_len = clamp_len
        self.sample_softmax = sample_softmax
        self.adaptive = adaptive
        self.dropout = dropout
        self.dropatt = dropatt
        self.untie_r = untie_r
        self.init = init
        self.init_range = init_range
        self.proj_init_std = proj_init_std
        self.init_std = init_std
        self.layer_norm_epsilon = layer_norm_epsilon

    @property
    def max_position_embeddings(self):
        # Maximum context the model attends to: target + extended + memory length.
        return self.tgt_len + self.ext_len + self.mem_len

    @property
    def n_token(self):
        # Backward-compatible alias for vocab_size.
        return self.vocab_size

    @n_token.setter
    def n_token(self, value):
        self.vocab_size = value

    @property
    def hidden_size(self):
        return self.d_model

    @property
    def num_attention_heads(self):
        return self.n_head

    @property
    def num_hidden_layers(self):
        return self.n_layer