""" PyTorch Flaubert model, based on XLM. """


import logging
import random

import torch
from torch.nn import functional as F

from .configuration_flaubert import FlaubertConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .modeling_xlm import (
    XLMForQuestionAnswering,
    XLMForQuestionAnsweringSimple,
    XLMForSequenceClassification,
    XLMModel,
    XLMWithLMHeadModel,
    get_masks,
)


logger = logging.getLogger(__name__)

FLAUBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
    "flaubert-small-cased": "https://cdn.huggingface.co/flaubert/flaubert_small_cased/pytorch_model.bin",
    "flaubert-base-uncased": "https://cdn.huggingface.co/flaubert/flaubert_base_uncased/pytorch_model.bin",
    "flaubert-base-cased": "https://cdn.huggingface.co/flaubert/flaubert_base_cased/pytorch_model.bin",
    "flaubert-large-cased": "https://cdn.huggingface.co/flaubert/flaubert_large_cased/pytorch_model.bin",
}

FLAUBERT_START_DOCSTRING = r"""

    This model is a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`_ sub-class.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general
    usage and behavior.

    Parameters:
        config (:class:`~transformers.FlaubertConfig`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the configuration.
            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
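
    Example (an illustrative sketch of the distinction described above, shown with
    :class:`~transformers.FlaubertModel` and the ``flaubert-base-cased`` checkpoint; the same pattern
    applies to the other classes in this module)::

        from transformers import FlaubertConfig, FlaubertModel

        config = FlaubertConfig.from_pretrained('flaubert-base-cased')
        model = FlaubertModel(config)  # configuration only, randomly initialized weights
        model = FlaubertModel.from_pretrained('flaubert-base-cased')  # configuration and pretrained weights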
"""

FLAUBERT_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using :class:`transformers.BertTokenizer`.
            See :func:`transformers.PreTrainedTokenizer.encode` and
            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.

            `What are input IDs? <../glossary.html#input-ids>`__
        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.

            `What are attention masks? <../glossary.html#attention-mask>`__
        token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Segment token indices to indicate first and second portions of the inputs.
            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
            corresponds to a `sentence B` token

            `What are token type IDs? <../glossary.html#token-type-ids>`_
        position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Selected in the range ``[0, config.max_position_embeddings - 1]``.

            `What are position IDs? <../glossary.html#position-ids>`_
        lengths (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Length of each sentence that can be used to avoid performing attention on padding token indices.
            You can also use `attention_mask` for the same result (see above), kept here for compatibility.
            Indices selected in ``[0, ..., input_ids.size(-1)]``.
        cache (:obj:`Dict[str, torch.FloatTensor]`, `optional`, defaults to :obj:`None`):
            Dictionary with ``torch.FloatTensor`` that contains pre-computed
            hidden-states (keys and values in the attention blocks) as computed by the model
            (see `cache` output below). Can be used to speed up sequential decoding.
            The dictionary object will be modified in-place during the forward pass to add newly computed hidden-states.
        head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
            Mask to nullify selected heads of the self-attention modules.
            Mask values selected in ``[0, 1]``:
            :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
        inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
            than the model's internal embedding lookup matrix.
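
    Example (an illustrative sketch of how the inputs documented above can be prepared for a single,
    unpadded sentence; for a padded batch, ``attention_mask`` would be 0 at the padding positions)::

        import torch
        from transformers import FlaubertTokenizer

        tokenizer = FlaubertTokenizer.from_pretrained('flaubert-base-cased')
        input_ids = torch.tensor([tokenizer.encode("Le chat mange une pomme.", add_special_tokens=True)])
        attention_mask = torch.ones_like(input_ids)  # every position is a real token here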
"""


@add_start_docstrings(
    "The bare Flaubert Model transformer outputting raw hidden-states without any specific head on top.",
    FLAUBERT_START_DOCSTRING,
)
class FlaubertModel(XLMModel):

    config_class = FlaubertConfig
    pretrained_model_archive_map = FLAUBERT_PRETRAINED_MODEL_ARCHIVE_MAP

    def __init__(self, config):
        super().__init__(config)
        self.layerdrop = getattr(config, "layerdrop", 0.0)
        self.pre_norm = getattr(config, "pre_norm", False)

    @add_start_docstrings_to_callable(FLAUBERT_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        langs=None,
        token_type_ids=None,
        position_ids=None,
        lengths=None,
        cache=None,
        head_mask=None,
        inputs_embeds=None,
    ):
        r"""
    Return:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.FlaubertConfig`) and inputs:
        last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        from transformers import FlaubertTokenizer, FlaubertModel
        import torch

        tokenizer = FlaubertTokenizer.from_pretrained('flaubert-base-cased')
        model = FlaubertModel.from_pretrained('flaubert-base-cased')
        input_ids = torch.tensor(tokenizer.encode("Le chat mange une pomme.", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids)
        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple

        """
        # figure out batch size and sequence length
        if input_ids is not None:
            bs, slen = input_ids.size()
        else:
            bs, slen = inputs_embeds.size()[:-1]

        if lengths is None:
            if input_ids is not None:
                lengths = (input_ids != self.pad_index).sum(dim=1).long()
            else:
                lengths = torch.LongTensor([slen] * bs)

        # check inputs
        assert lengths.size(0) == bs
        assert lengths.max().item() <= slen

        # generate masks
        mask, attn_mask = get_masks(slen, lengths, self.causal, padding_mask=attention_mask)

        device = input_ids.device if input_ids is not None else inputs_embeds.device

        # position_ids
        if position_ids is None:
            position_ids = torch.arange(slen, dtype=torch.long, device=device)
            position_ids = position_ids.unsqueeze(0).expand((bs, slen))
        else:
            assert position_ids.size() == (bs, slen)

        # langs
        if langs is not None:
            assert langs.size() == (bs, slen)

        # prepare head mask if needed
        head_mask = self.get_head_mask(head_mask, self.config.n_layers)

        # do not recompute cached elements
        if cache is not None and input_ids is not None:
            _slen = slen - cache["slen"]
            input_ids = input_ids[:, -_slen:]
            position_ids = position_ids[:, -_slen:]
            if langs is not None:
                langs = langs[:, -_slen:]
            mask = mask[:, -_slen:]
            attn_mask = attn_mask[:, -_slen:]

        # embeddings
        if inputs_embeds is None:
            inputs_embeds = self.embeddings(input_ids)

        tensor = inputs_embeds + self.position_embeddings(position_ids).expand_as(inputs_embeds)
        if langs is not None and self.use_lang_emb and self.config.n_langs > 1:
            tensor = tensor + self.lang_embeddings(langs)
        if token_type_ids is not None:
            tensor = tensor + self.embeddings(token_type_ids)
        tensor = self.layer_norm_emb(tensor)
        tensor = F.dropout(tensor, p=self.dropout, training=self.training)
        tensor *= mask.unsqueeze(-1).to(tensor.dtype)

        # transformer layers
        hidden_states = ()
        attentions = ()
        for i in range(self.n_layers):
            # LayerDrop: randomly skip a layer during training
            dropout_probability = random.uniform(0, 1)
            if self.training and (dropout_probability < self.layerdrop):
                continue

            if self.output_hidden_states:
                hidden_states = hidden_states + (tensor,)

            # self attention (post-norm or pre-norm, depending on the config)
            if not self.pre_norm:
                attn_outputs = self.attentions[i](tensor, attn_mask, cache=cache, head_mask=head_mask[i])
                attn = attn_outputs[0]
                if self.output_attentions:
                    attentions = attentions + (attn_outputs[1],)
                attn = F.dropout(attn, p=self.dropout, training=self.training)
                tensor = tensor + attn
                tensor = self.layer_norm1[i](tensor)
            else:
                tensor_normalized = self.layer_norm1[i](tensor)
                attn_outputs = self.attentions[i](tensor_normalized, attn_mask, cache=cache, head_mask=head_mask[i])
                attn = attn_outputs[0]
                if self.output_attentions:
                    attentions = attentions + (attn_outputs[1],)
                attn = F.dropout(attn, p=self.dropout, training=self.training)
                tensor = tensor + attn

            # FFN
            if not self.pre_norm:
                tensor = tensor + self.ffns[i](tensor)
                tensor = self.layer_norm2[i](tensor)
            else:
                tensor_normalized = self.layer_norm2[i](tensor)
                tensor = tensor + self.ffns[i](tensor_normalized)

            tensor *= mask.unsqueeze(-1).to(tensor.dtype)

        # add the last hidden state
        if self.output_hidden_states:
            hidden_states = hidden_states + (tensor,)

        # update cache length
        if cache is not None:
            cache["slen"] += tensor.size(1)

        outputs = (tensor,)
        if self.output_hidden_states:
            outputs = outputs + (hidden_states,)
        if self.output_attentions:
            outputs = outputs + (attentions,)
        return outputs  # outputs, (hidden_states), (attentions)


@add_start_docstrings(
    """The Flaubert Model transformer with a language modeling head on top
    (linear layer with weights tied to the input embeddings). """,
    FLAUBERT_START_DOCSTRING,
)
class FlaubertWithLMHeadModel(XLMWithLMHeadModel):
    """
    This class overrides :class:`~transformers.XLMWithLMHeadModel`. Please check the
    superclass for the appropriate documentation alongside usage examples.
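
    Example (an illustrative sketch following the usage pattern of the XLM superclass, where the first
    element of the returned tuple is the prediction scores over the vocabulary)::

        import torch
        from transformers import FlaubertTokenizer, FlaubertWithLMHeadModel

        tokenizer = FlaubertTokenizer.from_pretrained('flaubert-base-cased')
        model = FlaubertWithLMHeadModel.from_pretrained('flaubert-base-cased')
        input_ids = torch.tensor([tokenizer.encode("Le chat mange une pomme.", add_special_tokens=True)])
        outputs = model(input_ids)
        prediction_scores = outputs[0]  # shape (batch_size, sequence_length, vocab_size)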
    """

    config_class = FlaubertConfig
    pretrained_model_archive_map = FLAUBERT_PRETRAINED_MODEL_ARCHIVE_MAP

    def __init__(self, config):
        super().__init__(config)
        self.transformer = FlaubertModel(config)
        self.init_weights()


@add_start_docstrings(
    """Flaubert Model with a sequence classification/regression head on top (a linear layer on top of
    the pooled output) e.g. for GLUE tasks. """,
    FLAUBERT_START_DOCSTRING,
)
class FlaubertForSequenceClassification(XLMForSequenceClassification):
    """
    This class overrides :class:`~transformers.XLMForSequenceClassification`. Please check the
    superclass for the appropriate documentation alongside usage examples.
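
    Example (an illustrative sketch following the usage pattern of the XLM superclass, where passing
    ``labels`` makes the first two elements of the returned tuple the classification loss and the logits;
    the label value below is arbitrary)::

        import torch
        from transformers import FlaubertTokenizer, FlaubertForSequenceClassification

        tokenizer = FlaubertTokenizer.from_pretrained('flaubert-base-cased')
        model = FlaubertForSequenceClassification.from_pretrained('flaubert-base-cased')
        input_ids = torch.tensor([tokenizer.encode("Le chat mange une pomme.", add_special_tokens=True)])
        labels = torch.tensor([1])  # batch of one example, class index 1
        outputs = model(input_ids, labels=labels)
        loss, logits = outputs[:2]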
    """

    config_class = FlaubertConfig
    pretrained_model_archive_map = FLAUBERT_PRETRAINED_MODEL_ARCHIVE_MAP

    def __init__(self, config):
        super().__init__(config)
        self.transformer = FlaubertModel(config)
        self.init_weights()


@add_start_docstrings(
    """Flaubert Model with a span classification head on top for extractive question-answering tasks like SQuAD
    (a linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`). """,
    FLAUBERT_START_DOCSTRING,
)
class FlaubertForQuestionAnsweringSimple(XLMForQuestionAnsweringSimple):
    """
    This class overrides :class:`~transformers.XLMForQuestionAnsweringSimple`. Please check the
    superclass for the appropriate documentation alongside usage examples.
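
    Example (an illustrative sketch following the usage pattern of the XLM superclass, where passing
    ``start_positions``/``end_positions`` prepends the span loss to the returned tuple; the positions
    below are arbitrary token indices)::

        import torch
        from transformers import FlaubertTokenizer, FlaubertForQuestionAnsweringSimple

        tokenizer = FlaubertTokenizer.from_pretrained('flaubert-base-cased')
        model = FlaubertForQuestionAnsweringSimple.from_pretrained('flaubert-base-cased')
        input_ids = torch.tensor([tokenizer.encode("Le chat mange une pomme.", add_special_tokens=True)])
        start_positions = torch.tensor([1])
        end_positions = torch.tensor([3])
        outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
        loss, start_logits, end_logits = outputs[:3]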
    """

    config_class = FlaubertConfig
    pretrained_model_archive_map = FLAUBERT_PRETRAINED_MODEL_ARCHIVE_MAP

    def __init__(self, config):
        super().__init__(config)
        self.transformer = FlaubertModel(config)
        self.init_weights()


@add_start_docstrings(
    """Flaubert Model with a beam-search span classification head on top for extractive question-answering tasks
    like SQuAD (a linear layer on top of the hidden-states output to compute `span start logits` and
    `span end logits`). """,
    FLAUBERT_START_DOCSTRING,
)
class FlaubertForQuestionAnswering(XLMForQuestionAnswering):
    """
    This class overrides :class:`~transformers.XLMForQuestionAnswering`. Please check the
    superclass for the appropriate documentation alongside usage examples.
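
    Example (an illustrative sketch following the usage pattern of the XLM superclass, whose beam-search
    head returns the total span loss first when ``start_positions``/``end_positions`` are provided; the
    positions below are arbitrary token indices)::

        import torch
        from transformers import FlaubertTokenizer, FlaubertForQuestionAnswering

        tokenizer = FlaubertTokenizer.from_pretrained('flaubert-base-cased')
        model = FlaubertForQuestionAnswering.from_pretrained('flaubert-base-cased')
        input_ids = torch.tensor([tokenizer.encode("Le chat mange une pomme.", add_special_tokens=True)])
        start_positions = torch.tensor([1])
        end_positions = torch.tensor([3])
        outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
        loss = outputs[0]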
    """

    config_class = FlaubertConfig
    pretrained_model_archive_map = FLAUBERT_PRETRAINED_MODEL_ARCHIVE_MAP

    def __init__(self, config):
        super().__init__(config)
        self.transformer = FlaubertModel(config)
        self.init_weights()