# coding=utf-8
""" TF 2.0 Flaubert model. """


import logging
import random

import tensorflow as tf

from .configuration_flaubert import FlaubertConfig
from .file_utils import add_start_docstrings
from .modeling_tf_xlm import (
    TFXLMForSequenceClassification,
    TFXLMMainLayer,
    TFXLMModel,
    TFXLMWithLMHeadModel,
    get_masks,
    shape_list,
)
from .tokenization_utils import BatchEncoding


logger = logging.getLogger(__name__)

TF_FLAUBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {}

FLAUBERT_START_DOCSTRING = r"""

    This model is a `tf.keras.Model <https://www.tensorflow.org/api_docs/python/tf/keras/Model>`__ sub-class.
    Use it as a regular TF 2.0 Keras Model and
    refer to the TF 2.0 documentation for all matters related to general usage and behavior.

    Parameters:
        config (:class:`~transformers.FlaubertConfig`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the configuration.
            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
"""

FLAUBERT_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.
            Indices can be obtained using :class:`transformers.FlaubertTokenizer`.
            See :func:`transformers.PreTrainedTokenizer.encode` and
            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
            `What are input IDs? <../glossary.html#input-ids>`__
        attention_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
            `What are attention masks? <../glossary.html#attention-mask>`__
        langs (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            A parallel sequence of tokens to be used to indicate the language of each token in the input.
            Indices are languages ids which can be obtained from the language names by using two conversion mappings
            provided in the configuration of the model (only provided for multilingual models).
            More precisely, the `language name -> language id` mapping is in `model.config.lang2id` (dict str -> int) and
            the `language id -> language name` mapping is `model.config.id2lang` (dict int -> str).
            See usage examples detailed in the `multilingual documentation <https://huggingface.co/transformers/multilingual.html>`__.
        token_type_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Segment token indices to indicate first and second portions of the inputs.
            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
            corresponds to a `sentence B` token.
            `What are token type IDs? <../glossary.html#token-type-ids>`_
        position_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Indices of positions of each input sequence token in the position embeddings.
            Selected in the range ``[0, config.max_position_embeddings - 1]``.
            `What are position IDs? <../glossary.html#position-ids>`_
        lengths (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Length of each sentence that can be used to avoid performing attention on padding token indices.
            You can also use `attention_mask` for the same result (see above); kept here for compatibility.
            Indices selected in ``[0, ..., input_ids.size(-1)]``.
        cache (:obj:`Dict[str, tf.Tensor]`, `optional`, defaults to :obj:`None`):
            Dictionary with ``tf.Tensor`` that contains pre-computed
            hidden-states (keys and values in the attention blocks) as computed by the model
            (see `cache` output below). Can be used to speed up sequential decoding.
            The dictionary object will be modified in-place during the forward pass to add newly computed hidden-states.
        head_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
            Mask to nullify selected heads of the self-attention modules.
            Mask values selected in ``[0, 1]``:
            :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
        inputs_embeds (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
            than the model's internal embedding lookup matrix.
"""


@add_start_docstrings(
    "The bare Flaubert Model transformer outputting raw hidden-states without any specific head on top.",
    FLAUBERT_START_DOCSTRING,
)
class TFFlaubertModel(TFXLMModel):
    config_class = FlaubertConfig
    pretrained_model_archive_map = TF_FLAUBERT_PRETRAINED_MODEL_ARCHIVE_MAP

    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.transformer = TFFlaubertMainLayer(config, name="transformer")


class TFFlaubertMainLayer(TFXLMMainLayer):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        # FlauBERT-specific options: LayerDrop probability and pre-norm placement.
        self.layerdrop = getattr(config, "layerdrop", 0.0)
        self.pre_norm = getattr(config, "pre_norm", False)

    def call(
        self,
        inputs,
        attention_mask=None,
        langs=None,
        token_type_ids=None,
        position_ids=None,
        lengths=None,
        cache=None,
        head_mask=None,
        inputs_embeds=None,
        training=False,
    ):
        # Unpack inputs given as a tuple/list or as a dict/BatchEncoding.
        if isinstance(inputs, (tuple, list)):
            input_ids = inputs[0]
            attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
            langs = inputs[2] if len(inputs) > 2 else langs
            token_type_ids = inputs[3] if len(inputs) > 3 else token_type_ids
            position_ids = inputs[4] if len(inputs) > 4 else position_ids
            lengths = inputs[5] if len(inputs) > 5 else lengths
            cache = inputs[6] if len(inputs) > 6 else cache
            head_mask = inputs[7] if len(inputs) > 7 else head_mask
            inputs_embeds = inputs[8] if len(inputs) > 8 else inputs_embeds
            assert len(inputs) <= 9, "Too many inputs."
        elif isinstance(inputs, (dict, BatchEncoding)):
            input_ids = inputs.get("input_ids")
            attention_mask = inputs.get("attention_mask", attention_mask)
            langs = inputs.get("langs", langs)
            token_type_ids = inputs.get("token_type_ids", token_type_ids)
            position_ids = inputs.get("position_ids", position_ids)
            lengths = inputs.get("lengths", lengths)
            cache = inputs.get("cache", cache)
            head_mask = inputs.get("head_mask", head_mask)
            inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
            assert len(inputs) <= 9, "Too many inputs."
        else:
            input_ids = inputs

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            bs, slen = shape_list(input_ids)
        elif inputs_embeds is not None:
            bs, slen = shape_list(inputs_embeds)[:2]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        # Infer sentence lengths from the padding index when they are not provided.
        if lengths is None:
            if input_ids is not None:
                lengths = tf.reduce_sum(tf.cast(tf.not_equal(input_ids, self.pad_index), dtype=tf.int32), axis=1)
            else:
                lengths = tf.convert_to_tensor([slen] * bs, tf.int32)

        # check inputs
        tf.debugging.assert_equal(shape_list(lengths)[0], bs)

        # generate masks
        mask, attn_mask = get_masks(slen, lengths, self.causal, padding_mask=attention_mask)

        # position_ids
        if position_ids is None:
            position_ids = tf.expand_dims(tf.range(slen), axis=0)
        else:
            tf.debugging.assert_equal(shape_list(position_ids), [bs, slen])

        # langs
        if langs is not None:
            tf.debugging.assert_equal(shape_list(langs), [bs, slen])

        # Prepare head mask if needed (head masking is not implemented for this model).
        if head_mask is not None:
            raise NotImplementedError
        else:
            head_mask = [None] * self.n_layers

        # do not recompute cached elements
        if cache is not None and input_ids is not None:
            _slen = slen - cache["slen"]
            input_ids = input_ids[:, -_slen:]
            position_ids = position_ids[:, -_slen:]
            if langs is not None:
                langs = langs[:, -_slen:]
            mask = mask[:, -_slen:]
            attn_mask = attn_mask[:, -_slen:]

        # embeddings
        if inputs_embeds is None:
            inputs_embeds = self.embeddings(input_ids)

        tensor = inputs_embeds + self.position_embeddings(position_ids)
        if langs is not None and self.use_lang_emb:
            tensor = tensor + self.lang_embeddings(langs)
        if token_type_ids is not None:
            tensor = tensor + self.embeddings(token_type_ids)
        tensor = self.layer_norm_emb(tensor)
        tensor = self.dropout(tensor, training=training)
        tensor = tensor * mask[..., tf.newaxis]

        # transformer layers
        hidden_states = ()
        attentions = ()
        for i in range(self.n_layers):
            # LayerDrop: randomly skip a whole layer at training time.
            dropout_probability = random.uniform(0, 1)
            if training and (dropout_probability < self.layerdrop):
                continue

            if self.output_hidden_states:
                hidden_states = hidden_states + (tensor,)

            # self attention, with post-norm (default) or pre-norm placement
            if not self.pre_norm:
                attn_outputs = self.attentions[i]([tensor, attn_mask, None, cache, head_mask[i]], training=training)
                attn = attn_outputs[0]
                if self.output_attentions:
                    attentions = attentions + (attn_outputs[1],)
                attn = self.dropout(attn, training=training)
                tensor = tensor + attn
                tensor = self.layer_norm1[i](tensor)
            else:
                tensor_normalized = self.layer_norm1[i](tensor)
                attn_outputs = self.attentions[i](
                    [tensor_normalized, attn_mask, None, cache, head_mask[i]], training=training
                )
                attn = attn_outputs[0]
                if self.output_attentions:
                    attentions = attentions + (attn_outputs[1],)
                attn = self.dropout(attn, training=training)
                tensor = tensor + attn

            # FFN
            if not self.pre_norm:
                tensor = tensor + self.ffns[i](tensor)
                tensor = self.layer_norm2[i](tensor)
            else:
                tensor_normalized = self.layer_norm2[i](tensor)
                tensor = tensor + self.ffns[i](tensor_normalized)

            tensor = tensor * mask[..., tf.newaxis]

        # Add last hidden state
        if self.output_hidden_states:
            hidden_states = hidden_states + (tensor,)

        # update cache length (using shape_list rather than a torch-style .size() call)
        if cache is not None:
            cache["slen"] += shape_list(tensor)[1]

        outputs = (tensor,)
        if self.output_hidden_states:
            outputs = outputs + (hidden_states,)
        if self.output_attentions:
            outputs = outputs + (attentions,)
        return outputs  # outputs, (hidden_states), (attentions)


@add_start_docstrings(
    """The Flaubert Model transformer with a language modeling head on top
    (linear layer with weights tied to the input embeddings). """,
    FLAUBERT_START_DOCSTRING,
)
class TFFlaubertWithLMHeadModel(TFXLMWithLMHeadModel):
    config_class = FlaubertConfig
    pretrained_model_archive_map = TF_FLAUBERT_PRETRAINED_MODEL_ARCHIVE_MAP

    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.transformer = TFFlaubertMainLayer(config, name="transformer")


@add_start_docstrings(
    """Flaubert Model with a sequence classification/regression head on top (a linear layer on top of
    the pooled output), e.g. for GLUE tasks. """,
    FLAUBERT_START_DOCSTRING,
)
class TFFlaubertForSequenceClassification(TFXLMForSequenceClassification):
    config_class = FlaubertConfig
    pretrained_model_archive_map = TF_FLAUBERT_PRETRAINED_MODEL_ARCHIVE_MAP

    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.transformer = TFFlaubertMainLayer(config, name="transformer")
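

# A minimal inference sketch for the bare model (the checkpoint identifier and the
# example sentence are assumptions; substitute whichever FlauBERT checkpoint and
# language codes your configuration actually provides):
#
#     import tensorflow as tf
#     from transformers import FlaubertTokenizer, TFFlaubertModel
#
#     tokenizer = FlaubertTokenizer.from_pretrained("flaubert-base-cased")  # assumed checkpoint id
#     model = TFFlaubertModel.from_pretrained("flaubert-base-cased")
#     input_ids = tf.constant([tokenizer.encode("Le chat mange une pomme.")])
#     # Optionally tag every token as French via the ``lang2id`` mapping described above.
#     langs = tf.fill(tf.shape(input_ids), model.config.lang2id["fr"])
#     outputs = model({"input_ids": input_ids, "langs": langs})
#     last_hidden_state = outputs[0]  # (batch_size, sequence_length, hidden_size)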