U
    &cݙ                     @   s  d Z ddlZddlZddlZddlZddlZddlm	Z	 ddl
mZmZ ddlmZmZmZmZmZ ddlmZ eeZdd	d
dddddddd
Zdd Zdd ZdejfddZG dd dejjjZ G dd dejjjZ!G dd dejjjZ"G dd  d eZ#d!Z$d"Z%ed#e$G d$d% d%e#Z&G d&d' d'ejjjZ'ed(e$G d)d* d*e#Z(ed+e$G d,d- d-e#Z)ed.e$G d/d0 d0e#Z*dS )1z TF 2.0 XLM model.
    N   )	XLMConfig)add_start_docstrings add_start_docstrings_to_callable)TFPreTrainedModelTFSequenceSummaryTFSharedEmbeddingsget_initializer
shape_list)BatchEncodingz6https://cdn.huggingface.co/xlm-mlm-en-2048-tf_model.h5z8https://cdn.huggingface.co/xlm-mlm-ende-1024-tf_model.h5z8https://cdn.huggingface.co/xlm-mlm-enfr-1024-tf_model.h5z8https://cdn.huggingface.co/xlm-mlm-enro-1024-tf_model.h5z>https://cdn.huggingface.co/xlm-mlm-tlm-xnli15-1024-tf_model.h5z:https://cdn.huggingface.co/xlm-mlm-xnli15-1024-tf_model.h5z8https://cdn.huggingface.co/xlm-clm-enfr-1024-tf_model.h5z8https://cdn.huggingface.co/xlm-clm-ende-1024-tf_model.h5z6https://cdn.huggingface.co/xlm-mlm-17-1280-tf_model.h5z7https://cdn.huggingface.co/xlm-mlm-100-1280-tf_model.h5)
zxlm-mlm-en-2048zxlm-mlm-ende-1024zxlm-mlm-enfr-1024zxlm-mlm-enro-1024zxlm-mlm-tlm-xnli15-1024zxlm-mlm-xnli15-1024zxlm-clm-enfr-1024zxlm-clm-ende-1024zxlm-mlm-17-1280zxlm-mlm-100-1280c              	      s   t  fddt| D }tt |d d dd df |d d dd df< tt |d d dd df |d d dd df< d S )Nc                    s$   g | ]  fd dt D qS )c              	      s(   g | ] }t d d|d     qS )i'     )nppower).0j)dimpos @/tmp/pip-unpacked-wheel-ymerj3tt/transformers/modeling_tf_xlm.py
<listcomp>1   s     z;create_sinusoidal_embeddings.<locals>.<listcomp>.<listcomp>)range)r   r   )r   r   r   1   s     z0create_sinusoidal_embeddings.<locals>.<listcomp>r   r   r   )r   arrayr   tfconstantsincos)Zn_posr   outZposition_encr   r   r   create_sinusoidal_embeddings0   s    4r   c                 C   s(   ddt j| t jd   }| | S )a   Gaussian Error Linear Unit.
    Original Implementation of the gelu activation function in Google Bert repo when initially created.
        For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
        0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
        Also see https://arxiv.org/abs/1606.08415
    g      ?      ?g       @)r   matherfsqrt)xZcdfr   r   r   gelu6   s     r$   c           	   	   C   s   t |d }|dk	r|}n&t| }tj||ddtjf }|rtt|tjtjddf || df|tjddtjf }n|}tj	t ||| g |dkst ||| | gkst
tj||d}tj||d}||fS )zH
    Generate hidden states mask, and optionally an attention mask.
    r   Nr   Fdtype)r
   r   r   r    ZlessnewaxisZ
less_equalZtile	debuggingassert_equalAssertionErrorcast)	slenlengthscausalpadding_maskr&   bsmaskalen	attn_maskr   r   r   	get_masksA   s     
" r4   c                       s6   e Zd Ze Z fddZdd ZdddZ  Z	S )	TFMultiHeadAttentionc                    s   t  jf | ttj| _|j| _|| _|| _| j| j dksBt	t
jjj|t|jdd| _t
jjj|t|jdd| _t
jjj|t|jdd| _t
jjj|t|jdd| _t
jj|j| _t | _d S )Nr   q_linZkernel_initializernamek_linv_linout_lin)super__init__nextr5   NEW_IDlayer_idoutput_attentionsr   n_headsr*   r   keraslayersDenser	   init_stdr6   r9   r:   r;   Dropoutattention_dropoutdropoutsetpruned_heads)selfrB   r   configkwargs	__class__r   r   r=   d   s    zTFMultiHeadAttention.__init__c                 C   s   t d S NNotImplementedError)rL   headsr   r   r   prune_headss   s    z TFMultiHeadAttention.prune_headsFc                    s  |\}}}}}t |\ }}	|dkr>|dkr0|n
|d | }
nt |d }
j}j| tt |dkrv d||
fn
 dd|
f} fdd} fdd}||}|dkr||}||}n8|dksj|kr| }}||}||}|dk	r~j|krp|dkrb|j \}}tj	||gd	d
}tj	||gd	d
}n|j \}}||f|j< |t
 }tj||dd}t||}|dd|   }tjj|dd
}j||d}|dk	r|| }t||}||}|f}jr||f }|S )zd
        Self-attention (if kv is None) or attention over source sentence (provided by kv).
        Nr,   r      c                    s    t jt |  djfddS )z  projection r   r   r   rV   perm)r   	transposereshaperB   r#   r0   Zdim_per_headrL   r   r   shape   s    z(TFMultiHeadAttention.call.<locals>.shapec                    s"   t t j| dd dj fS )z  compute context rX   rY   rW   )r   r\   r[   rB   r]   r^   r   r   unshape   s    z*TFMultiHeadAttention.call.<locals>.unshaper   ZaxisT)Ztranspose_bgꌠ9Y>)Fr   rW   training)r
   rB   r   lenr6   r9   r:   r@   r   concatr    r"   matmulr\   nnZsoftmaxrI   r;   rA   )rL   inputsrc   inputr1   kvcache	head_maskqlenr   ZklenrB   Zmask_reshaper_   r`   qkvZk_Zv_Zscoresweightscontextoutputsr   r^   r   callv   sP    
(



zTFMultiHeadAttention.call)F)
__name__
__module____qualname__	itertoolscountr?   r=   rU   rt   __classcell__r   r   rO   r   r5   `   s   r5   c                       s&   e Zd Z fddZdddZ  ZS )TFTransformerFFNc                    s|   t  jf | tjjj|t|jdd| _tjjj|t|jdd| _	|j
rZtjjtntjjj| _tjj|j| _d S )Nlin1r7   lin2)r<   r=   r   rC   rD   rE   r	   rF   r|   r}   Zgelu_activationZ
Activationr$   ZactivationsZreluactrG   rI   )rL   Zin_dimZ
dim_hiddenZout_dimrM   rN   rO   r   r   r=      s
     zTFTransformerFFN.__init__Fc                 C   s0   |  |}| |}| |}| j||d}|S )Nrb   )r|   r~   r}   rI   )rL   ri   rc   r#   r   r   r   rt      s
    


zTFTransformerFFN.call)F)ru   rv   rw   r=   rt   rz   r   r   rO   r   r{      s   r{   c                	       s>   e Zd Z fddZdd Zdd Zdd ZdddZ  ZS )TFXLMMainLayerc                    s|  t  jf | |j| _|j| _|j| _|j | _| jr>td|j| _|j| _|j	| _	|j
| _
|j| _|j| _|j| _| jd | _|j| _|j| _| j| j dkstdtjj|j| _tjj|j| _tjjj|j| jt|jdd| _|jrt|jdkr0|j	r0tjjj| j| jt|jdd| _t | j
| j|jd	d
| _!tjjj"|j#dd| _$g | _%g | _&g | _'g | _(t)| jD ]}| j%*t+| j| j|d,|d | j&*tjjj"|j#d,|d | j'*t-| j| j| j|d,|d | j(*tjjj"|j#d,|d qt.|drx|j/0 1 }i |_/|D ]>\}}| j%t2| j|jkr8| 3t2|t4t5t2|i q8d S )Nz,Currently XLM can only be used as an encoder   r   z-transformer dim must be a multiple of n_headsposition_embeddings)Zembeddings_initializerr8   r   lang_embeddings
embeddingsZinitializer_ranger8   layer_norm_emb)epsilonr8   zattentions_._{})rM   r8   zlayer_norm1_._{}z	ffns_._{}zlayer_norm2_._{}rK   )6r<   r=   rA   output_hidden_statesZ
is_encoderZ
is_decoderrS   r.   n_langsuse_lang_embn_wordsZ	eos_index	pad_indexZemb_dimr   Z
hidden_dimrB   n_layersr*   r   rC   rD   rG   rI   rH   Z	EmbeddingZmax_position_embeddingsr	   Zembed_init_stdr   Zsinusoidal_embeddingsr   r   r   ZLayerNormalizationZlayer_norm_epsr   
attentionslayer_norm1ffnslayer_norm2r   appendr5   formatr{   hasattrrK   copyitemsintrU   listmap)rL   rM   rN   irK   ZlayerrT   rO   r   r   r=      s    
   zTFXLMMainLayer.__init__c                 C   s   | j S rQ   )r   rL   r   r   r   get_input_embeddings#  s    z#TFXLMMainLayer.get_input_embeddingsc                 C   s   t d S rQ   rR   )rL   Znew_num_tokensr   r   r   _resize_token_embeddings&  s    z'TFXLMMainLayer._resize_token_embeddingsc                 C   s   t dS )z Prunes heads of the model.
            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
            See base class PreTrainedModel
        NrR   )rL   Zheads_to_pruner   r   r   _prune_heads)  s    zTFXLMMainLayer._prune_headsNFc                 C   s  t |ttfr|d }t|dkr*|d n|}t|dkrB|d n|}t|dkrZ|d n|}t|dkrr|d n|}t|dkr|d n|}t|dkr|d n|}t|dkr|d n|}t|d	kr|d	 n|	}	t|d
kstdnt |ttfr~|d}|d|}|d|}|d|}|d|}|d|}|d|}|d|}|d|	}	t|d
kstdn|}|d k	r|	d k	rtdn@|d k	rt	|\}}n(|	d k	rt	|	d d \}}ntd|d kr.|d k	rt
jt
jt
|| jt
jddd}nt
|g| t
j}t
jt	|d | t||| j|d\}}|d krzt
jt
|dd}nt
jt	|||g |d k	rt
jt	|||g |d k	rtnd g| j }|d k	rd|d k	rd||d  }|d d | d f }|d d | d f }|d k	r8|d d | d f }|d d | d f }|d d | d f }|	d krx| |}	|	| | }|d k	r| jr| jdkr|| | }|d k	r|| | }| |}| j||
d}||dt
j f  }d}d}t| jD ]}| j!r||f }| j"| ||d ||| g|
d}|d }| j#rZ||d f }| j||
d}|| }| j$| |}|| j%| | }| j&| |}||dt
j f  }q| j!r||f }|d k	r|d  |'d7  < |f}| j!r||f }| j#r||f }|S )Nr   r   r   rV   r               	   zToo many inputs.	input_idsattention_masklangstoken_type_idsposition_idsr-   rk   rl   inputs_embedszDYou cannot specify both input_ids and inputs_embeds at the same timez5You have to specify either input_ids or inputs_embedsr%   ra   )r/   r,   rb   .r   )(
isinstancetupler   rd   r*   dictr   get
ValueErrorr
   r   Z
reduce_sumr+   	not_equalr   int32Zconvert_to_tensorr(   r)   r4   r.   Zexpand_dimsr   rS   r   r   r   r   r   r   r   rI   r'   r   r   rA   r   r   r   size)rL   rh   r   r   r   r   r-   rk   rl   r   rc   r   r0   r,   r1   r3   Z_slenZtensorhidden_statesr   r   Zattn_outputsZattnrs   r   r   r   rt   0  s    





&	








 




zTFXLMMainLayer.call)	NNNNNNNNF)	ru   rv   rw   r=   r   r   r   rt   rz   r   r   rO   r   r      s   Z
         r   c                   @   s(   e Zd ZdZeZeZdZe	dd Z
dS )TFXLMPreTrainedModelz An abstract class to handle weights initialization and
        a simple interface for downloading and loading pretrained models.
    transformerc              	   C   s   t dddddgdddddgdddddgg}t dddddgdddddgdddddgg}| jjr| jjdkrt dddddgdddddgdddddgg}nd }|||d	S )
Nr   r   r   r   r   rV   r   r   )r   r   r   )r   r   rM   r   r   )rL   Zinputs_listZ
attns_listZ
langs_listr   r   r   dummy_inputs  s    ..0z!TFXLMPreTrainedModel.dummy_inputsN)ru   rv   rw   __doc__r   Zconfig_class#TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAPZpretrained_model_archive_mapZbase_model_prefixpropertyr   r   r   r   r   r     s   r   a  

    .. note::

        TF 2.0 models accepts two formats as inputs:

            - having all inputs as keyword arguments (like PyTorch models), or
            - having all inputs as a list, tuple or dict in the first positional arguments.

        This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having
        all the tensors in the first argument of the model call function: :obj:`model(inputs)`.

        If you choose this second option, there are three possibilities you can use to gather all the input Tensors
        in the first positional argument :

        - a single Tensor with input_ids only and nothing else: :obj:`model(inputs_ids)`
        - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
          :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])`
        - a dictionary with one or several input Tensors associated to the input names given in the docstring:
          :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`

    Parameters:
        config (:class:`~transformers.XLMConfig`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the configuration.
            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
a  
    Args:
        input_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using :class:`transformers.BertTokenizer`.
            See :func:`transformers.PreTrainedTokenizer.encode` and
            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.

            `What are input IDs? <../glossary.html#input-ids>`__
        attention_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.

            `What are attention masks? <../glossary.html#attention-mask>`__
        langs (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            A parallel sequence of tokens to be used to indicate the language of each token in the input.
            Indices are languages ids which can be obtained from the language names by using two conversion mappings
            provided in the configuration of the model (only provided for multilingual models).
            More precisely, the `language name -> language id` mapping is in `model.config.lang2id` (dict str -> int) and
            the `language id -> language name` mapping is `model.config.id2lang` (dict int -> str).

            See usage examples detailed in the `multilingual documentation <https://huggingface.co/transformers/multilingual.html>`__.
        token_type_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Segment token indices to indicate first and second portions of the inputs.
            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
            corresponds to a `sentence B` token

            `What are token type IDs? <../glossary.html#token-type-ids>`_
        position_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Selected in the range ``[0, config.max_position_embeddings - 1]``.

            `What are position IDs? <../glossary.html#position-ids>`_
        lengths (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Length of each sentence that can be used to avoid performing attention on padding token indices.
            You can also use `attention_mask` for the same result (see above), kept here for compatbility.
            Indices selected in ``[0, ..., input_ids.size(-1)]``:
        cache (:obj:`Dict[str, tf.Tensor]`, `optional`, defaults to :obj:`None`):
            dictionary with ``tf.Tensor`` that contains pre-computed
            hidden-states (key and values in the attention blocks) as computed by the model
            (see `cache` output below). Can be used to speed up sequential decoding.
            The dictionary object will be modified in-place during the forward pass to add newly computed hidden-states.
        head_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
            Mask to nullify selected heads of the self-attention modules.
            Mask values selected in ``[0, 1]``:
            :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
        input_embeds (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
            than the model's internal embedding lookup matrix.
z\The bare XLM Model transformer outputing raw hidden-states without any specific head on top.c                       s,   e Zd Z fddZeedd Z  ZS )
TFXLMModelc                    s&   t  j|f|| t|dd| _d S )Nr   r8   )r<   r=   r   r   rL   rM   rh   rN   rO   r   r   r=   ?  s    zTFXLMModel.__init__c                 K   s   | j |f|}|S )a  
    Return:
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLMConfig`) and inputs:
        last_hidden_state (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        import tensorflow as tf
        from transformers import XLMTokenizer, TFXLMModel

        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
        model = TFXLMModel.from_pretrained('xlm-mlm-en-2048')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
        outputs = model(input_ids)
        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple

        )r   )rL   rh   rN   rs   r   r   r   rt   C  s    zTFXLMModel.callru   rv   rw   r=   r   XLM_INPUTS_DOCSTRINGrt   rz   r   r   rO   r   r   :  s   r   c                       s4   e Zd ZdZ fddZ fddZdd Z  ZS )TFXLMPredLayerz?
    Prediction layer (cross_entropy or adaptive_softmax).
    c                    s@   t  jf | |j| _|j| _|j| _|jdkr8|| _ntd S )NF)r<   r=   Zasmr   r   input_embeddingsrS   )rL   rM   r   rN   rO   r   r   r=   k  s    
zTFXLMPredLayer.__init__c                    s(   | j | jfdddd| _t | d S )NzerosTbias)r_   ZinitializerZ	trainabler8   )Z
add_weightr   r   r<   build)rL   Zinput_shaperO   r   r   r   |  s    zTFXLMPredLayer.buildc                 C   s   | j |dd}|| j }|S )NZlinear)mode)r   r   )rL   r   r   r   r   rt     s    
zTFXLMPredLayer.call)ru   rv   rw   r   r=   r   rt   rz   r   r   rO   r   r   f  s   r   z}The XLM Model transformer with a language modeling head on top
    (linear layer with weights tied to the input embeddings). c                       s<   e Zd Z fddZdd Zdd Zeedd Z  Z	S )	TFXLMWithLMHeadModelc                    s:   t  j|f|| t|dd| _t|| jjdd| _d S )Nr   r   zpred_layer_._proj)r<   r=   r   r   r   r   
pred_layerr   rO   r   r   r=     s    zTFXLMWithLMHeadModel.__init__c                 C   s   | j jS rQ   )r   r   r   r   r   r   get_output_embeddings  s    z*TFXLMWithLMHeadModel.get_output_embeddingsc                 K   sj   | j j}| j j}|jd }tj|dftjd| }tj||gdd}|d k	r\t|| }nd }||dS )Nr   r   r%   ra   )rh   r   )	rM   mask_token_idlang_idr_   r   Zonesr   re   Z	ones_like)rL   rh   rN   r   r   Zeffective_batch_sizeZ
mask_tokenr   r   r   r   prepare_inputs_for_generation  s    
z2TFXLMWithLMHeadModel.prepare_inputs_for_generationc                 K   s6   | j |f|}|d }| |}|f|dd  }|S )a  
    Return:
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLMConfig`) and inputs:
        prediction_scores (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        import tensorflow as tf
        from transformers import XLMTokenizer, TFXLMWithLMHeadModel

        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
        model = TFXLMWithLMHeadModel.from_pretrained('xlm-mlm-en-2048')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
        outputs = model(input_ids)
        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple

        r   r   N)r   r   )rL   rh   rN   transformer_outputsoutputrs   r   r   r   rt     s
    
zTFXLMWithLMHeadModel.call)
ru   rv   rw   r=   r   r   r   r   rt   rz   r   r   rO   r   r     s
   r   zXLM Model with a sequence classification/regression head on top (a linear layer on top of
    the pooled output) e.g. for GLUE tasks. c                       s,   e Zd Z fddZeedd Z  ZS )TFXLMForSequenceClassificationc                    s@   t  j|f|| |j| _t|dd| _t||jdd| _d S )Nr   r   sequence_summaryr   )r<   r=   
num_labelsr   r   r   rF   r   r   rO   r   r   r=     s    z'TFXLMForSequenceClassification.__init__c                 K   s6   | j |f|}|d }| |}|f|dd  }|S )a  
    Returns:
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLMConfig`) and inputs:
        logits (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        import tensorflow as tf
        from transformers import XLMTokenizer, TFXLMForSequenceClassification

        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
        model = TFXLMForSequenceClassification.from_pretrained('xlm-mlm-en-2048')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
        labels = tf.constant([1])[None, :]  # Batch size 1
        outputs = model(input_ids)
        logits = outputs[0]

        r   r   N)r   r   )rL   rh   rN   r   r   logitsrs   r   r   r   rt     s
     
z#TFXLMForSequenceClassification.callr   r   r   rO   r   r     s   r   zXLM Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
    the hidden-states output to compute `span start logits` and `span end logits`). c                       s,   e Zd Z fddZeedd Z  ZS )TFXLMForQuestionAnsweringSimplec                    sD   t  j|f|| t|dd| _tjjj|jt	|j
dd| _d S )Nr   r   
qa_outputsr7   )r<   r=   r   r   r   rC   rD   rE   r   r	   rF   r   r   rO   r   r   r=     s      z(TFXLMForQuestionAnsweringSimple.__init__c           	      K   sh   | j |f|}|d }| |}tj|ddd\}}tj|dd}tj|dd}||f|dd  }|S )a  
    Returns:
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLMConfig`) and inputs:
        start_scores (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length,)`):
            Span-start scores (before SoftMax).
        end_scores (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length,)`):
            Span-end scores (before SoftMax).
        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        import tensorflow as tf
        from transformers import XLMTokenizer, TFXLMForQuestionAnsweringSimple

        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
        model = TFXLMForQuestionAnsweringSimple.from_pretrained('xlm-mlm-en-2048')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
        outputs = model(input_ids)
        start_scores, end_scores = outputs[:2]

        r   r   rW   ra   r   N)r   r   r   splitZsqueeze)	rL   rh   rN   r   Zsequence_outputr   Zstart_logitsZ
end_logitsrs   r   r   r   rt     s    !

z$TFXLMForQuestionAnsweringSimple.callr   r   r   rO   r   r     s   r   )+r   rx   loggingr    Znumpyr   Z
tensorflowr   Zconfiguration_xlmr   Z
file_utilsr   r   Zmodeling_tf_utilsr   r   r   r	   r
   Ztokenization_utilsr   	getLoggerru   loggerr   r   r$   Zfloat32r4   rC   rD   ZLayerr5   r{   r   r   ZXLM_START_DOCSTRINGr   r   r   r   r   r   r   r   r   r   <module>   sj   
X  7(!?1