""" TF 2.0 OpenAI GPT-2 model. """


import logging

import numpy as np
import tensorflow as tf

from .configuration_gpt2 import GPT2Config
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .modeling_tf_utils import (
    TFConv1D,
    TFPreTrainedModel,
    TFSequenceSummary,
    TFSharedEmbeddings,
    get_initializer,
    keras_serializable,
    shape_list,
)
from .tokenization_utils import BatchEncoding


logger = logging.getLogger(__name__)

TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = {
    "gpt2": "https://cdn.huggingface.co/gpt2-tf_model.h5",
    "gpt2-medium": "https://cdn.huggingface.co/gpt2-medium-tf_model.h5",
    "gpt2-large": "https://cdn.huggingface.co/gpt2-large-tf_model.h5",
    "gpt2-xl": "https://cdn.huggingface.co/gpt2-xl-tf_model.h5",
    "distilgpt2": "https://cdn.huggingface.co/distilgpt2-tf_model.h5",
}


def gelu(x):
    """Gaussian Error Linear Unit.
    This is a smoother version of the RELU.
    Original paper: https://arxiv.org/abs/1606.08415
    Args:
        x: float Tensor to perform activation.
    Returns:
        `x` with the GELU activation applied.
    """
    cdf = 0.5 * (1.0 + tf.tanh((np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
    return x * cdf


class TFAttention(tf.keras.layers.Layer):
    def __init__(self, nx, n_ctx, config, scale=False, **kwargs):
        super().__init__(**kwargs)
        self.output_attentions = config.output_attentions

        n_state = nx  # in Attention: n_state = nx = n_embd
        assert n_state % config.n_head == 0
        self.n_ctx = n_ctx
        self.n_head = config.n_head
        self.split_size = n_state
        self.scale = scale

        self.c_attn = TFConv1D(n_state * 3, nx, initializer_range=config.initializer_range, name="c_attn")
        self.c_proj = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_proj")
        self.attn_dropout = tf.keras.layers.Dropout(config.attn_pdrop)
        self.resid_dropout = tf.keras.layers.Dropout(config.resid_pdrop)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        pass

    @staticmethod
    def causal_attention_mask(nd, ns, dtype):
        """1's in the lower triangle, counting from the lower right corner.
        Same as tf.matrix_band_part(tf.ones([nd, ns]), -1, ns-nd), but doesn't produce garbage on TPUs.
        """
        i = tf.range(nd)[:, None]
        j = tf.range(ns)
        m = i >= j - ns + nd
        return tf.cast(m, dtype)

    def _attn(self, inputs, training=False):
        q, k, v, attention_mask, head_mask = inputs
        # q, k, v have shape [batch, heads, sequence, features]
        w = tf.matmul(q, k, transpose_b=True)
        if self.scale:
            dk = tf.cast(shape_list(k)[-1], tf.float32)  # scale attention scores
            w = w / tf.math.sqrt(dk)

        # mask attention weights so that each position only attends to the past
        _, _, nd, ns = shape_list(w)
        b = self.causal_attention_mask(nd, ns, dtype=w.dtype)
        b = tf.reshape(b, [1, 1, nd, ns])
        w = w * b - 1e4 * (1 - b)

        if attention_mask is not None:
            # apply the (additive) attention mask
            w = w + attention_mask

        w = tf.nn.softmax(w, axis=-1)
        w = self.attn_dropout(w, training=training)

        # mask heads if we want to
        if head_mask is not None:
            w = w * head_mask

        outputs = [tf.matmul(w, v)]
        if self.output_attentions:
            outputs.append(w)
        return outputs

    def merge_heads(self, x):
        x = tf.transpose(x, [0, 2, 1, 3])
        x_shape = shape_list(x)
        new_x_shape = x_shape[:-2] + [x_shape[-2] * x_shape[-1]]
        return tf.reshape(x, new_x_shape)

    def split_heads(self, x):
        x_shape = shape_list(x)
        new_x_shape = x_shape[:-1] + [self.n_head, x_shape[-1] // self.n_head]
        x = tf.reshape(x, new_x_shape)
        return tf.transpose(x, (0, 2, 1, 3))  # (batch, head, seq_length, head_features)

    def call(self, inputs, training=False):
        x, layer_past, attention_mask, head_mask, use_cache = inputs

        x = self.c_attn(x)
        query, key, value = tf.split(x, 3, axis=2)
        query = self.split_heads(query)
        key = self.split_heads(key)
        value = self.split_heads(value)
        if layer_past is not None:
            past_key, past_value = tf.unstack(layer_past, axis=0)
            key = tf.concat([past_key, key], axis=-2)
            value = tf.concat([past_value, value], axis=-2)

        # to cope with keras serialization, use_cache may arrive as a tensor
        if tf.is_tensor(use_cache):
            if hasattr(use_cache, "numpy"):
                use_cache = bool(use_cache.numpy())
            else:
                use_cache = True

        if use_cache:
            present = tf.stack([key, value], axis=0)
        else:
            present = (None,)

        attn_outputs = self._attn([query, key, value, attention_mask, head_mask], training=training)
        a = attn_outputs[0]

        a = self.merge_heads(a)
        a = self.c_proj(a)
        a = self.resid_dropout(a, training=training)

        outputs = [a, present] + attn_outputs[1:]
        return outputs  # a, present, (attentions)


class TFMLP(tf.keras.layers.Layer):
    def __init__(self, n_state, config, **kwargs):
        super().__init__(**kwargs)
        nx = config.n_embd
        self.c_fc = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_fc")
        self.c_proj = TFConv1D(nx, n_state, initializer_range=config.initializer_range, name="c_proj")
        self.act = gelu
        self.dropout = tf.keras.layers.Dropout(config.resid_pdrop)

    def call(self, x, training=False):
        h = self.act(self.c_fc(x))
        h2 = self.c_proj(h)
        h2 = self.dropout(h2, training=training)
        return h2


class TFBlock(tf.keras.layers.Layer):
    def __init__(self, n_ctx, config, scale=False, **kwargs):
        super().__init__(**kwargs)
        nx = config.n_embd
        self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_1")
        self.attn = TFAttention(nx, n_ctx, config, scale, name="attn")
        self.ln_2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_2")
        self.mlp = TFMLP(4 * nx, config, name="mlp")

    def call(self, inputs, training=False):
        x, layer_past, attention_mask, head_mask, use_cache = inputs

        a = self.ln_1(x)
        output_attn = self.attn([a, layer_past, attention_mask, head_mask, use_cache], training=training)
        a = output_attn[0]  # output_attn: a, present, (attentions)
        x = x + a

        m = self.ln_2(x)
        m = self.mlp(m, training=training)
        x = x + m

        outputs = [x] + output_attn[1:]
        return outputs  # x, present, (attentions)


@keras_serializable
class TFGPT2MainLayer(tf.keras.layers.Layer):
    config_class = GPT2Config

    def __init__(self, config, *inputs, **kwargs):
        super().__init__(*inputs, **kwargs)
        self.output_hidden_states = config.output_hidden_states
        self.output_attentions = config.output_attentions
        self.num_hidden_layers = config.n_layer
        self.vocab_size = config.vocab_size
        self.n_embd = config.n_embd

        self.wte = TFSharedEmbeddings(
            config.vocab_size, config.hidden_size, initializer_range=config.initializer_range, name="wte"
        )
        self.wpe = tf.keras.layers.Embedding(
            config.n_positions,
            config.n_embd,
            embeddings_initializer=get_initializer(config.initializer_range),
            name="wpe",
        )
        self.drop = tf.keras.layers.Dropout(config.embd_pdrop)
        self.h = [TFBlock(config.n_ctx, config, scale=True, name="h_._{}".format(i)) for i in range(config.n_layer)]
        self.ln_f = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_f")

    def get_input_embeddings(self):
        return self.wte

    def _resize_token_embeddings(self, new_num_tokens):
        raise NotImplementedError

    def _prune_heads(self, heads_to_prune):
        """ Prunes heads of the model.
            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
        """
        raise NotImplementedError

    def call(
        self,
        inputs,
        past=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        use_cache=True,
        training=False,
    ):
        if isinstance(inputs, (tuple, list)):
            input_ids = inputs[0]
            past = inputs[1] if len(inputs) > 1 else past
            attention_mask = inputs[2] if len(inputs) > 2 else attention_mask
            token_type_ids = inputs[3] if len(inputs) > 3 else token_type_ids
            position_ids = inputs[4] if len(inputs) > 4 else position_ids
            head_mask = inputs[5] if len(inputs) > 5 else head_mask
            inputs_embeds = inputs[6] if len(inputs) > 6 else inputs_embeds
            use_cache = inputs[7] if len(inputs) > 7 else use_cache
            assert len(inputs) <= 8, "Too many inputs."
        elif isinstance(inputs, (dict, BatchEncoding)):
            input_ids = inputs.get("input_ids")
            past = inputs.get("past", past)
            attention_mask = inputs.get("attention_mask", attention_mask)
            token_type_ids = inputs.get("token_type_ids", token_type_ids)
            position_ids = inputs.get("position_ids", position_ids)
            head_mask = inputs.get("head_mask", head_mask)
            inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
            use_cache = inputs.get("use_cache", use_cache)
            assert len(inputs) <= 8, "Too many inputs."
        else:
            input_ids = inputs

        # if `past` is used, only the last token of the new input needs to be processed
        if past is not None:
            if input_ids is not None:
                input_ids = input_ids[:, -1:]
            if inputs_embeds is not None:
                inputs_embeds = inputs_embeds[:, -1:]
            if token_type_ids is not None:
                token_type_ids = token_type_ids[:, -1:]

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = shape_list(input_ids)
            input_ids = tf.reshape(input_ids, [-1, input_shape[-1]])
        elif inputs_embeds is not None:
            input_shape = shape_list(inputs_embeds)[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if past is None:
            past_length = 0
            past = [None] * len(self.h)
        else:
            past_length = shape_list(past[0][0])[-2]
        if position_ids is None:
            position_ids = tf.range(past_length, input_shape[-1] + past_length, dtype=tf.int32)[tf.newaxis, :]

        if attention_mask is not None:
            # create a broadcastable mask of shape [batch_size, 1, 1, to_seq_length] from the 2D mask,
            # then turn it into an additive mask: 0.0 for tokens to attend to, -10000.0 for masked tokens
            attention_mask = attention_mask[:, tf.newaxis, tf.newaxis, :]
            attention_mask = tf.cast(attention_mask, tf.float32)
            attention_mask = (1.0 - attention_mask) * -10000.0
        else:
            attention_mask = None

        # prepare head mask if needed (1.0 in head_mask indicates we keep the head)
        if head_mask is not None:
            raise NotImplementedError
        else:
            head_mask = [None] * self.num_hidden_layers

        position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]])

        if inputs_embeds is None:
            inputs_embeds = self.wte(input_ids, mode="embedding")
        position_embeds = self.wpe(position_ids)
        if token_type_ids is not None:
            token_type_ids = tf.reshape(token_type_ids, [-1, shape_list(token_type_ids)[-1]])
            token_type_embeds = self.wte(token_type_ids, mode="embedding")
        else:
            token_type_embeds = 0
        hidden_states = inputs_embeds + position_embeds + token_type_embeds
        hidden_states = self.drop(hidden_states, training=training)

        output_shape = input_shape + [shape_list(hidden_states)[-1]]

        presents = ()
        all_attentions = []
        all_hidden_states = ()
        for i, (block, layer_past) in enumerate(zip(self.h, past)):
            if self.output_hidden_states:
                all_hidden_states = all_hidden_states + (tf.reshape(hidden_states, output_shape),)

            outputs = block([hidden_states, layer_past, attention_mask, head_mask[i], use_cache], training=training)

            hidden_states, present = outputs[:2]
            presents = presents + (present,)

            if self.output_attentions:
                all_attentions.append(outputs[2])

        hidden_states = self.ln_f(hidden_states)

        hidden_states = tf.reshape(hidden_states, output_shape)
        # add the last hidden state
        if self.output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        outputs = (hidden_states,)
        if use_cache is True:
            outputs = outputs + (presents,)
        if self.output_hidden_states:
            outputs = outputs + (all_hidden_states,)
        if self.output_attentions:
            # let the number of heads free (-1) so we can extract attention even after head pruning
            attention_output_shape = input_shape[:-1] + [-1] + shape_list(all_attentions[0])[-2:]
            all_attentions = tuple(tf.reshape(t, attention_output_shape) for t in all_attentions)
            outputs = outputs + (all_attentions,)
        return outputs  # last hidden state, (presents), (all hidden_states), (attentions)


class TFGPT2PreTrainedModel(TFPreTrainedModel):
    """ An abstract class to handle weights initialization and
        a simple interface for downloading and loading pretrained models.
    """

    config_class = GPT2Config
    pretrained_model_archive_map = TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP
    base_model_prefix = "transformer"


GPT2_START_DOCSTRING = r"""

    .. note::
        TF 2.0 models accept two formats as inputs:

            - having all inputs as keyword arguments (like PyTorch models), or
            - having all inputs as a list, tuple or dict in the first positional arguments.

        This second option is useful when using the :obj:`tf.keras.Model.fit()` method, which currently requires
        having all the tensors in the first argument of the model call function: :obj:`model(inputs)`.

        If you choose this second option, there are three possibilities you can use to gather all the input Tensors
        in the first positional argument (a short sketch follows the list):

        - a single Tensor with input_ids only and nothing else: :obj:`model(input_ids)`
        - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
          :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])`
        - a dictionary with one or several input Tensors associated to the input names given in the docstring:
          :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
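
        As a rough sketch (assuming ``input_ids`` and ``attention_mask`` were already produced by
        :class:`transformers.GPT2Tokenizer`), these possibilities look like::

            outputs = model(input_ids)
            outputs = model([input_ids, attention_mask])
            outputs = model({"input_ids": input_ids, "attention_mask": attention_mask})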

    Parameters:
        config (:class:`~transformers.GPT2Config`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the configuration.
            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
"""

GPT2_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.
            If `past` is used, optionally only the last `input_ids` have to be input (see `past`).

            Indices can be obtained using :class:`transformers.GPT2Tokenizer`.
            See :func:`transformers.PreTrainedTokenizer.encode` and
            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.

            `What are input IDs? <../glossary.html#input-ids>`__
        past (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`):
            Contains pre-computed hidden-states (keys and values in the attention blocks) as computed by the model
            (see `past` output below). Can be used to speed up sequential decoding. The token ids which have their past given to this model
            should not be passed as input ids as they have already been computed.
        attention_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.

            `What are attention masks? <../glossary.html#attention-mask>`__
        token_type_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Segment token indices to indicate first and second portions of the inputs.
            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
            corresponds to a `sentence B` token.
            If `past` is used, optionally only the last `token_type_ids` have to be input (see `past`).

            `What are token type IDs? <../glossary.html#token-type-ids>`_
        position_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Selected in the range ``[0, config.max_position_embeddings - 1]``.

            `What are position IDs? <../glossary.html#position-ids>`_
        head_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
            Mask to nullify selected heads of the self-attention modules.
            Mask values selected in ``[0, 1]``:
            :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
        inputs_embeds (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
            than the model's internal embedding lookup matrix.
            If `past` is used, optionally only the last `inputs_embeds` have to be input (see `past`).
        training (:obj:`boolean`, `optional`, defaults to :obj:`False`):
            Whether to activate dropout modules (if set to :obj:`True`) during training or to de-activate them
            (if set to :obj:`False`) for evaluation.
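
    A minimal sketch of sequential decoding with ``past`` (assuming a trained
    :class:`~transformers.TFGPT2LMHeadModel` instance named ``model`` and tokenized ``input_ids``)::

        logits, past = model(input_ids, use_cache=True)[:2]
        next_token = tf.math.argmax(logits[:, -1, :], axis=-1)[:, tf.newaxis]
        # only the new token is fed back in; `past` already caches the keys/values of the earlier tokens
        logits, past = model(next_token, past=past, use_cache=True)[:2]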
"""


@add_start_docstrings(
    "The bare GPT2 Model transformer outputting raw hidden-states without any specific head on top.",
    GPT2_START_DOCSTRING,
)
class TFGPT2Model(TFGPT2PreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.transformer = TFGPT2MainLayer(config, name="transformer")

    @add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING)
    def call(self, inputs, **kwargs):
        r"""
    Return:
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.GPT2Config`) and inputs:
        last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the last layer of the model.
        past (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`):
            Contains pre-computed hidden-states (keys and values in the attention blocks).
            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
            should not be passed as input ids as they have already been computed.
        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`tf.Tensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        import tensorflow as tf
        from transformers import GPT2Tokenizer, TFGPT2Model

        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        model = TFGPT2Model.from_pretrained('gpt2')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
        outputs = model(input_ids)
        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple

        """
        outputs = self.transformer(inputs, **kwargs)
        return outputs


@add_start_docstrings(
    """The GPT2 Model transformer with a language modeling head on top
    (linear layer with weights tied to the input embeddings). """,
    GPT2_START_DOCSTRING,
)
class TFGPT2LMHeadModel(TFGPT2PreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.transformer = TFGPT2MainLayer(config, name="transformer")

    def get_output_embeddings(self):
        return self.transformer.wte

    def prepare_inputs_for_generation(self, inputs, past, **kwargs):
        # only keep the last token of the input ids if `past` is defined
        if past:
            inputs = tf.expand_dims(inputs[:, -1], -1)

        return {"inputs": inputs, "past": past, "use_cache": kwargs["use_cache"]}

    @add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING)
    def call(self, inputs, **kwargs):
        r"""
    Return:
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.GPT2Config`) and inputs:
        prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        past (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`):
            Contains pre-computed hidden-states (keys and values in the attention blocks).
            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
            should not be passed as input ids as they have already been computed.
        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`tf.Tensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        import tensorflow as tf
        from transformers import GPT2Tokenizer, TFGPT2LMHeadModel

        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        model = TFGPT2LMHeadModel.from_pretrained('gpt2')

        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
        outputs = model(input_ids)
        logits = outputs[0]

        """
        transformer_outputs = self.transformer(inputs, **kwargs)
        hidden_states = transformer_outputs[0]

        lm_logits = self.transformer.wte(hidden_states, mode="linear")

        outputs = (lm_logits,) + transformer_outputs[1:]

        return outputs  # lm_logits, presents, (all hidden_states), (attentions)


@add_start_docstrings(
    """The GPT2 Model transformer with a language modeling and a multiple-choice classification
    head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers.
    The language modeling head has its weights tied to the input embeddings,
    the classification head takes as input the input of a specified classification token index in the input sequence).
""",
    GPT2_START_DOCSTRING,
)
class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        config.num_labels = 1
        self.transformer = TFGPT2MainLayer(config, name="transformer")
        self.multiple_choice_head = TFSequenceSummary(
            config, initializer_range=config.initializer_range, name="multiple_choice_head"
        )

    def get_output_embeddings(self):
        return self.transformer.wte

    @add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING)
    def call(
        self,
        inputs,
        past=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        mc_token_ids=None,
        use_cache=True,
        training=False,
    ):
        r"""
        mc_token_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, num_choices)`, `optional`, defaults to index of the last token of the input):
            Index of the classification token in each input sequence.
            Selected in the range ``[0, input_ids.size(-1) - 1[``.

    Return:
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.GPT2Config`) and inputs:
        lm_prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        mc_prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`):
            Prediction scores of the multiple choice classification head (scores for each choice before SoftMax).
        past (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`):
            Contains pre-computed hidden-states (keys and values in the attention blocks).
            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
            should not be passed as input ids as they have already been computed.
        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`tf.Tensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.


    Examples::

        # For example purposes. Not runnable.
        import tensorflow as tf
        from transformers import GPT2Tokenizer, TFGPT2DoubleHeadsModel

        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        model = TFGPT2DoubleHeadsModel.from_pretrained('gpt2')

        # Add a [CLS] to the vocabulary (we should train it also!)
        # This option is currently not implemented in TF 2.0
        raise NotImplementedError
        tokenizer.add_special_tokens({'cls_token': '[CLS]'})
        model.resize_token_embeddings(len(tokenizer))  # Update the model embeddings with the new vocabulary size
        print(tokenizer.cls_token_id, len(tokenizer))  # The newly added token is the last token of the vocabulary

        choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
        encoded_choices = [tokenizer.encode(s) for s in choices]
        cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]

        input_ids = tf.constant(encoded_choices)[None, :]  # Batch size: 1, number of choices: 2
        mc_token_ids = tf.constant([cls_token_location])  # Batch size: 1

        outputs = model(input_ids, mc_token_ids=mc_token_ids)
        lm_prediction_scores, mc_prediction_scores = outputs[:2]

        """
        if isinstance(inputs, (tuple, list)):
            input_ids = inputs[0]
            past = inputs[1] if len(inputs) > 1 else past
            attention_mask = inputs[2] if len(inputs) > 2 else attention_mask
            token_type_ids = inputs[3] if len(inputs) > 3 else token_type_ids
            position_ids = inputs[4] if len(inputs) > 4 else position_ids
            head_mask = inputs[5] if len(inputs) > 5 else head_mask
            inputs_embeds = inputs[6] if len(inputs) > 6 else inputs_embeds
            mc_token_ids = inputs[7] if len(inputs) > 7 else mc_token_ids
            use_cache = inputs[8] if len(inputs) > 8 else use_cache
            assert len(inputs) <= 9, "Too many inputs."
        elif isinstance(inputs, dict):
            input_ids = inputs.get("input_ids")
            past = inputs.get("past", past)
            attention_mask = inputs.get("attention_mask", attention_mask)
            token_type_ids = inputs.get("token_type_ids", token_type_ids)
            position_ids = inputs.get("position_ids", position_ids)
            head_mask = inputs.get("head_mask", head_mask)
            inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
            mc_token_ids = inputs.get("mc_token_ids", mc_token_ids)
            use_cache = inputs.get("use_cache", use_cache)
            assert len(inputs) <= 9, "Too many inputs."
        else:
            input_ids = inputs

        if input_ids is not None:
            input_shapes = shape_list(input_ids)
        else:
            input_shapes = shape_list(inputs_embeds)[:-1]

        seq_length = input_shapes[-1]

        # flatten the (batch_size, num_choices, seq_length) inputs so the transformer sees 2D tensors
        flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None
        flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
        flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
        flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None

        flat_inputs = [
            flat_input_ids,
            past,
            flat_attention_mask,
            flat_token_type_ids,
            flat_position_ids,
            head_mask,
            inputs_embeds,
            use_cache,
        ]

        transformer_outputs = self.transformer(flat_inputs, training=training)
        hidden_states = transformer_outputs[0]

        hidden_states = tf.reshape(hidden_states, input_shapes + shape_list(hidden_states)[-1:])

        lm_logits = self.transformer.wte(hidden_states, mode="linear")

        mc_logits = self.multiple_choice_head([hidden_states, mc_token_ids], training=training)
        mc_logits = tf.squeeze(mc_logits, axis=-1)

        outputs = (lm_logits, mc_logits) + transformer_outputs[1:]

        return outputs  # lm logits, mc logits, presents, (all hidden_states), (attentions)