U
    &c{                     @   sP  d Z ddlZddlZddlZddlZddlZddlmZ ddlmZ ddl	m
Z
mZ ddlmZ ddlmZmZ ddlmZmZmZmZ eeZd	d
iZdd Zejee
dZG dd dejZG dd dejZG dd dejZ G dd deZ!dZ"dZ#ede"G dd de!Z$ede"G dd de!Z%ede"G dd  d e!Z&dS )!zPyTorch OpenAI GPT model.    N)CrossEntropyLoss   )gelu_newswish)OpenAIGPTConfig)add_start_docstrings add_start_docstrings_to_callable)Conv1DPreTrainedModelSequenceSummaryprune_conv1d_layerz
openai-gptz7https://cdn.huggingface.co/openai-gpt-pytorch_model.binc                    s  ddl }ddl dkr$tjtd td ddd}t	
|}W 5 Q R X td	 ddd}t	
|}W 5 Q R X   fd
d|D } fddtdD }	  |	d|dd }	dd t|	|D }	dd |	D }	z8| jjj|	d jkst| jjj|	d jks(tW nb tk
r }
 zB|
 j| jjj|	d jf7  _|
 j| jjj|	d jf7  _ W 5 d}
~
X Y nX t|	d | jj_t|	d | jj_|d |	d |	d t||	D ]\}}|dd }|dd dks
t|dd }|d}| }|D ]}|d|rH|d|}n|g}|d dkrht|d}nB|d dkrt|d}n(|d dkrt|d}nt||d }t|dkr(t|d }|| }q(z|j|jkstW n< tk
r" }
 z|
 j|j|jf7  _ W 5 d}
~
X Y nX z|j|jks8tW n< tk
rv }
 z|
 j|j|jf7  _ W 5 d}
~
X Y nX td| t||_q| S )zM Load tf pre-trained weights in a pytorch model (from NumPy arrays here)
    r   Nz.ckptzLoading weights from {}z/parameters_names.jsonrzutf-8)encodingz/params_shapes.jsonc                    s   g | ]}  |qS  )prod).0shape)npr   @/tmp/pip-unpacked-wheel-ymerj3tt/transformers/modeling_openai.py
<listcomp>6   s     z1load_tf_weights_in_openai_gpt.<locals>.<listcomp>c                    s    g | ]}  d | qS )z/params_{}.npy)loadformat)r   nr   openai_checkpoint_folder_pathr   r   r   7   s     
   c                 S   s   g | ]\}}| |qS r   )Zreshape)r   paramr   r   r   r   r   9   s     c                 S   s   g | ]}|  qS r   )squeeze)r   Zarrr   r   r   r   >   s     r      z:0/z[A-Za-z]+\d+z(\d+)gweightbbiasw   zInitialize PyTorch weight {})reZnumpyospathdirnameloggerinfor   openjsonr   ZcumsumrangesplitZconcatenateziptokens_embedr#   r   AssertionErrorpositions_embedargstorchZ
from_numpydatapop	fullmatchgetattrlenint)modelconfigr   r(   Znames_handlenamesZshapes_handleZshapesoffsetsZinit_paramsenamearraypointerZm_nameZscope_namesnumr   r   r   load_tf_weights_in_openai_gpt'   sv    



rG   )Zrelur   Zgeluc                       sL   e Zd Zd fdd	Zdd ZdddZd	d
 ZdddZdddZ  Z	S )	AttentionFc              	      s   t    |}||j dks t| dtt||dd|| |j| _|| _	|| _
|j| _t|d || _t||| _t|j| _t|j| _t | _d S )Nr   r%   r      )super__init__n_headr4   Zregister_bufferr7   Ztrilonesview
split_sizescaleoutput_attentionsr	   c_attnc_projnnDropoutZ
attn_pdropattn_dropoutresid_pdropresid_dropoutsetpruned_heads)selfnxn_ctxr?   rP   n_state	__class__r   r   rK   x   s    
&zAttention.__init__c                    s  t |dkrd S t| j| j| j }t|| j }|D ](  t fdd| jD 8  d| < q8|d	 
d}tt ||  }t||| j |d| j  g}t| j|dd| _t| j|dd| _| j| j | jt |  | _| jt | | _| j|| _d S )Nr   c                 3   s   | ]}| k rd ndV  qdS )r   r   Nr   )r   hheadr   r   	<genexpr>   s     z(Attention.prune_heads.<locals>.<genexpr>r   r   r'   Zdim)r<   r7   rM   rL   rO   rY   rZ   sumrN   
contiguouseqarangelongcatr   rR   rS   union)r[   headsmaskindexZ
index_attnr   rb   r   prune_heads   s    
 zAttention.prune_headsNc           	      C   s   t ||}| jr&|t|d }| jd d d d d |dd |df }|| dd|   }|d k	rx|| }tjdd|}| 	|}|d k	r|| }t ||g}| j
r|| |S )Nr   r         r   re   )r7   matmulrP   mathsqrtsizer%   rT   ZSoftmaxrV   rQ   append)	r[   qkvattention_mask	head_maskr&   r$   outputsr   r   r   _attn   s    .

zAttention._attnc                 C   sD   | dddd }| d d |d|d f }|j| S )Nr   r'   r   rI   r    r   )permuterg   ru   rN   )r[   xnew_x_shaper   r   r   merge_heads   s    &zAttention.merge_headsc                 C   sX   |  d d | j| d| j f }|j| }|rD|ddddS |ddddS d S )Nr   r   r'   rI   r   )ru   rL   rN   r~   )r[   r   rx   r   r   r   r   split_heads   s
    &
zAttention.split_headsc           
      C   s   |  |}|j| jdd\}}}| |}| j|dd}| |}| |||||}|d }| |}| |}| |}|g|dd   }	|	S )Nr'   re   T)rx   r   r   )rR   r1   rO   r   r}   r   rS   rX   )
r[   r   rz   r{   querykeyvalueattn_outputsar|   r   r   r   forward   s    





zAttention.forward)F)NN)F)NN)
__name__
__module____qualname__rK   rp   r}   r   r   r   __classcell__r   r   r_   r   rH   w   s   

rH   c                       s$   e Zd Z fddZdd Z  ZS )MLPc                    sF   t    |j}t||| _t||| _t|j | _t	
|j| _d S N)rJ   rK   n_embdr	   c_fcrS   ACT_FNSZafnactrT   rU   rW   dropout)r[   r^   r?   r\   r_   r   r   rK      s    
zMLP.__init__c                 C   s$   |  | |}| |}| |S r   )r   r   rS   r   )r[   r   ra   h2r   r   r   r      s    
zMLP.forwardr   r   r   rK   r   r   r   r   r_   r   r      s   r   c                       s(   e Zd Zd fdd	ZdddZ  ZS )	BlockFc                    sX   t    |j}t||||| _tj||jd| _t	d| || _
tj||jd| _d S )N)Zeps   )rJ   rK   r   rH   attnrT   	LayerNormZlayer_norm_epsilonln_1r   mlpln_2)r[   r]   r?   rP   r\   r_   r   r   rK      s    
zBlock.__init__Nc           
      C   sT   | j |||d}|d }| || }| |}| || }|g|dd   }	|	S )N)rz   r{   r   r   )r   r   r   r   )
r[   r   rz   r{   r   r   r   mra   r|   r   r   r   r      s    
zBlock.forward)F)NNr   r   r   r_   r   r      s   r   c                   @   s(   e Zd ZdZeZeZeZ	dZ
dd ZdS )OpenAIGPTPreTrainedModelz An abstract class to handle weights initialization and
        a simple interface for downloading and loading pretrained models.
    transformerc                 C   s|   t |tjtjtfrR|jjjd| jj	d t |tjtfrx|j
dk	rx|j
j  n&t |tjrx|j
j  |jjd dS )z! Initialize the weights.
        g        )ZmeanZstdN      ?)
isinstancerT   Linear	Embeddingr	   r#   r8   Znormal_r?   Zinitializer_ranger%   Zzero_r   Zfill_)r[   moduler   r   r   _init_weights  s    z&OpenAIGPTPreTrainedModel._init_weightsN)r   r   r   __doc__r   Zconfig_class'OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAPZpretrained_model_archive_maprG   Zload_tf_weightsZbase_model_prefixr   r   r   r   r   r      s   r   as  

    This model is a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`_ sub-class.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general
    usage and behavior.

    Parameters:
        config (:class:`~transformers.OpenAIGPTConfig`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the configuration.
            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
a	  
    Args:
        input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using :class:`transformers.OpenAIGPTTokenizer`.
            See :func:`transformers.PreTrainedTokenizer.encode` and
            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.

            `What are input IDs? <../glossary.html#input-ids>`__
        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.

            `What are attention masks? <../glossary.html#attention-mask>`__
        token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Segment token indices to indicate first and second portions of the inputs.
            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
            corresponds to a `sentence B` token

            `What are token type IDs? <../glossary.html#token-type-ids>`_
        position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Selected in the range ``[0, config.max_position_embeddings - 1]``.

            `What are position IDs? <../glossary.html#position-ids>`_
        head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
            Mask to nullify selected heads of the self-attention modules.
            Mask values selected in ``[0, 1]``:
            :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
        input_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
            than the model's internal embedding lookup matrix.
zdThe bare OpenAI GPT transformer model outputting raw hidden-states without any specific head on top.c                       sF   e Zd Z fddZdd Zdd Zdd Zeedd
dZ	  Z
S )OpenAIGPTModelc                    sz   t     j| _ j| _t j j| _t j	 j| _
t j| _t fddt jD | _|   d S )Nc                    s   g | ]}t  j d dqS )T)rP   )r   r]   )r   _r?   r   r   r   P  s     z+OpenAIGPTModel.__init__.<locals>.<listcomp>)rJ   rK   rQ   output_hidden_statesrT   r   
vocab_sizer   r3   Zn_positionsr5   rU   Z
embd_pdropdropZ
ModuleListr0   n_layerra   init_weightsr[   r?   r_   r   r   rK   H  s     zOpenAIGPTModel.__init__c                 C   s   | j S r   r3   r[   r   r   r   get_input_embeddingsT  s    z#OpenAIGPTModel.get_input_embeddingsc                 C   s
   || _ d S r   r   )r[   Znew_embeddingsr   r   r   set_input_embeddingsW  s    z#OpenAIGPTModel.set_input_embeddingsc                 C   s(   |  D ]\}}| j| j| qdS )zz Prunes heads of the model.
            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
        N)itemsra   r   rp   )r[   Zheads_to_pruneZlayerrm   r   r   r   _prune_headsZ  s    zOpenAIGPTModel._prune_headsNc                 C   s  |dk	r|dk	rt dnD|dk	r<| }|d|d }n"|dk	rV| dd }nt d|dkr|dk	rt|jn|j}tj|d tj|d}|dd|d }|dk	r|dd}|jt	| 
 jd	}d
| d }| || jj}|dkr| |}| |}	|dk	r6|d|d}| |}
nd}
||	 |
 }| |}||df }d}d}t| jD ]P\}}| jr||j| f }||||| }|d }| jrr||d f }qr| jr||j| f }|j| f}| jr||f }| jr||f }|S )a  
    Return:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.OpenAIGPTConfig`) and inputs:
        last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the last layer of the model.
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        from transformers import OpenAIGPTTokenizer, OpenAIGPTModel
        import torch

        tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
        model = OpenAIGPTModel.from_pretrained('openai-gpt')
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids)
        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple

        NzDYou cannot specify both input_ids and inputs_embeds at the same timer   z5You have to specify either input_ids or inputs_embeds)dtypedevicer   r   r'   )r   r   rq   r   )
ValueErrorru   rN   r   r7   ri   rj   Z	unsqueezetonext
parametersr   Zget_head_maskr?   r   r3   r5   r   	enumeratera   r   rQ   )r[   	input_idsrz   token_type_idsposition_idsr{   inputs_embedsZinput_shaper   Zposition_embedsZtoken_type_embedshidden_statesZoutput_shapeZall_attentionsZall_hidden_statesiblockr|   r   r   r   r   a  sX    '







zOpenAIGPTModel.forward)NNNNNN)r   r   r   rK   r   r   r   r   OPENAI_GPT_INPUTS_DOCSTRINGr   r   r   r   r_   r   r   C  s         r   zOpenAI GPT Model transformer with a language modeling head on top
    (linear layer with weights tied to the input embeddings). c                       s6   e Zd Z fddZdd ZeedddZ  ZS )	OpenAIGPTLMHeadModelc                    s8   t  | t|| _tj|j|jdd| _| 	  d S )NFr%   )
rJ   rK   r   r   rT   r   r   r   lm_headr   r   r_   r   r   rK     s    
zOpenAIGPTLMHeadModel.__init__c                 C   s   | j S r   r   r   r   r   r   get_output_embeddings  s    z*OpenAIGPTLMHeadModel.get_output_embeddingsNc                 C   s   | j ||||||d}|d }	| |	}
|
f|dd  }|dk	r|
dddddf  }|dddf  }t }||d|d|d}|f| }|S )a
  
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Labels for language modeling.
            Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
            Indices are selected in ``[-100, 0, ..., config.vocab_size]``
            All labels set to ``-100`` are ignored (masked), the loss is only
            computed for labels in ``[0, ..., config.vocab_size]``

    Return:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.OpenAIGPTConfig`) and inputs:
        loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when ``labels`` is provided)
            Language modeling loss.
        prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`):
            Contains pre-computed hidden-states (key and values in the attention blocks).
            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
            should not be passed as input ids as they have already been computed.
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        from transformers import OpenAIGPTTokenizer, OpenAIGPTLMHeadModel
        import torch

        tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
        model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids, labels=input_ids)
        loss, logits = outputs[:2]

    rz   r   r   r{   r   r   r   N.r   )r   r   rg   r   rN   ru   )r[   r   rz   r   r   r{   r   labelstransformer_outputsr   	lm_logitsr|   shift_logitsshift_labelsloss_fctlossr   r   r   r     s$    5

zOpenAIGPTLMHeadModel.forward)NNNNNNN	r   r   r   rK   r   r   r   r   r   r   r   r_   r   r     s          r   at  OpenAI GPT Model transformer with a language modeling and a multiple-choice classification
    head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers.
    The language modeling head has its weights tied to the input embeddings,
    the classification head takes as input the input of a specified classification token index in the input sequence).
c                
       s6   e Zd Z fddZdd ZeedddZ  ZS )	OpenAIGPTDoubleHeadsModelc                    sH   t  | d|_t|| _tj|j|jdd| _	t
|| _|   d S )Nr   Fr   )rJ   rK   Z
num_labelsr   r   rT   r   r   r   r   r   multiple_choice_headr   r   r_   r   r   rK   6  s    

z"OpenAIGPTDoubleHeadsModel.__init__c                 C   s   | j S r   r   r   r   r   r   r   @  s    z/OpenAIGPTDoubleHeadsModel.get_output_embeddingsNc
                 C   s   | j ||||||d}
|
d }| |}| ||d}||f|
dd  }|	dk	rt }||d|d|	d}|f| }|dk	r|dddddf  }|dddf  }t }||d|d|d}|f| }|S )a  
        mc_token_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_choices)`, `optional`, default to index of the last token of the input)
            Index of the classification token in each input sequence.
            Selected in the range ``[0, input_ids.size(-1) - 1[``.
        lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`)
            Labels for language modeling.
            Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
            Indices are selected in ``[-1, 0, ..., config.vocab_size]``
            All labels set to ``-100`` are ignored (masked), the loss is only
            computed for labels in ``[0, ..., config.vocab_size]``
        mc_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size)`, `optional`, defaults to :obj:`None`)
            Labels for computing the multiple choice classification loss.
            Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
            of the input tensors. (see `input_ids` above)

    Return:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.OpenAIGPTConfig`) and inputs:
        lm_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``lm_labels`` is provided):
            Language modeling loss.
        mc_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`multiple_choice_labels` is provided):
            Multiple choice classification loss.
        lm_prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        mc_prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`):
            Prediction scores of the multiple choice classification head (scores for each choice before SoftMax).
        past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`):
            Contains pre-computed hidden-states (key and values in the attention blocks).
            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
            should not be passed as input ids as they have already been computed.
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        from transformers import OpenAIGPTTokenizer, OpenAIGPTDoubleHeadsModel
        import torch

        tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
        model = OpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt')
        tokenizer.add_special_tokens({'cls_token': '[CLS]'})  # Add a [CLS] to the vocabulary (we should train it also!)
        model.resize_token_embeddings(len(tokenizer))

        choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
        input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
        mc_token_ids = torch.tensor([input_ids.size(-1)-1, input_ids.size(-1)-1]).unsqueeze(0)  # Batch size 1

        outputs = model(input_ids, mc_token_ids=mc_token_ids)
        lm_prediction_scores, mc_prediction_scores = outputs[:2]

    r   r   r   r   N.)r   r   r   r   r   rN   ru   rg   )r[   r   rz   r   r   r{   r   Zmc_token_idsZ	lm_labelsZ	mc_labelsr   r   r   Z	mc_logitsr|   r   r   r   r   r   r   r   r   C  s.    H


z!OpenAIGPTDoubleHeadsModel.forward)	NNNNNNNNNr   r   r   r_   r   r   -  s   	
         r   )'r   r/   loggingrs   r)   r7   Ztorch.nnrT   r   Zactivationsr   r   Zconfiguration_openair   Z
file_utilsr   r   Zmodeling_utilsr	   r
   r   r   	getLoggerr   r,   r   rG   ZReLUr   ModulerH   r   r   r   ZOPENAI_GPT_START_DOCSTRINGr   r   r   r   r   r   r   r   <module>   sJ   
M^& 
X