"""PyTorch MMBT model."""

import logging

import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss, MSELoss

from .file_utils import add_start_docstrings
from .modeling_utils import ModuleUtilsMixin


logger = logging.getLogger(__name__)


class ModalEmbeddings(nn.Module):
    """Generic Modal Embeddings which takes in an encoder and a transformer embedding."""

    def __init__(self, config, encoder, embeddings):
        super().__init__()
        self.config = config
        self.encoder = encoder
        # Project the modal encoder's output into the transformer's hidden size.
        self.proj_embeddings = nn.Linear(config.modal_hidden_size, config.hidden_size)
        # Share position/token-type/word embeddings and LayerNorm with the text model.
        self.position_embeddings = embeddings.position_embeddings
        self.token_type_embeddings = embeddings.token_type_embeddings
        self.word_embeddings = embeddings.word_embeddings
        self.LayerNorm = embeddings.LayerNorm
        self.dropout = nn.Dropout(p=config.hidden_dropout_prob)

    def forward(self, input_modal, start_token=None, end_token=None, position_ids=None, token_type_ids=None):
        token_embeddings = self.proj_embeddings(self.encoder(input_modal))
        seq_length = token_embeddings.size(1)

        if start_token is not None:
            # Prepend the (word-embedded) start token, e.g. [CLS].
            start_token_embeds = self.word_embeddings(start_token)
            seq_length += 1
            token_embeddings = torch.cat([start_token_embeds.unsqueeze(1), token_embeddings], dim=1)

        if end_token is not None:
            # Append the (word-embedded) end token, e.g. [SEP].
            end_token_embeds = self.word_embeddings(end_token)
            seq_length += 1
            token_embeddings = torch.cat([token_embeddings, end_token_embeds.unsqueeze(1)], dim=1)

        if position_ids is None:
            position_ids = torch.arange(seq_length, dtype=torch.long, device=input_modal.device)
            position_ids = position_ids.unsqueeze(0).expand(input_modal.size(0), seq_length)

        if token_type_ids is None:
            # Modal tokens default to segment 0 (text defaults to segment 1 in MMBTModel).
            token_type_ids = torch.zeros(
                (input_modal.size(0), seq_length), dtype=torch.long, device=input_modal.device
            )

        position_embeddings = self.position_embeddings(position_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)
        embeddings = token_embeddings + position_embeddings + token_type_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings


MMBT_START_DOCSTRING = r"""    MMBT model was proposed in
    `Supervised Multimodal Bitransformers for Classifying Images and Text`_
    by Douwe Kiela, Suvrat Bhooshan, Hamed Firooz, Davide Testuggine.
    It's a supervised multimodal bitransformer model that fuses information from text and image encoders,
    and obtains state-of-the-art performance on various multimodal classification benchmark tasks.

    This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and
    refer to the PyTorch documentation for all matter related to general usage and behavior.

    .. _`Supervised Multimodal Bitransformers for Classifying Images and Text`:
        https://github.com/facebookresearch/mmbt

    .. _`torch.nn.Module`:
        https://pytorch.org/docs/stable/nn.html#module

    Parameters:
        config (:class:`~transformers.MMBTConfig`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the configuration.
        transformer (:class: `~nn.Module`): A text transformer that is used by MMBT.
            It should have embeddings, encoder, and pooler attributes.
        encoder (:class: `~nn.Module`): Encoder for the second modality.
            It should take in a batch of modal inputs and, for each example, return ``k`` embeddings of dimension ``n``.
"""

MMBT_INPUTS_DOCSTRING = r"""    Inputs:
        **input_modal**: ``torch.FloatTensor`` of shape ``(batch_size, ***)``:
            The other modality data. It will be the shape that the encoder for that type expects.
            e.g. With an Image Encoder, the shape would be (batch_size, channels, height, width)
        **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
            Indices of input sequence tokens in the vocabulary.
            It does not expect a [CLS] token to be added, as the classification token is supplied through the other modality's embeddings (see ``modal_start_tokens``).
            See :func:`transformers.PreTrainedTokenizer.encode` and
            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
        **modal_start_tokens**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
            Optional start token to be prepended to the other modality's embedding; [CLS] is most commonly used for classification tasks.
        **modal_end_tokens**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
            Optional end token to be appended to the other modality's embedding; [SEP] is most commonly used.
        **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
            Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
        **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
            Segment token indices to indicate different portions of the inputs.
        **modal_token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, modal_sequence_length)``:
            Segment token indices to indicate different portions of the non-text modality.
            The embeddings from these tokens will be summed with the respective token embeddings for the non-text modality.
        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
            Indices of positions of each input sequence tokens in the position embeddings.
        **modal_position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, modal_sequence_length)``:
            Indices of positions of each input sequence tokens in the position embeddings for the non-text modality.
        **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
            Mask to nullify selected heads of the self-attention modules.
            Mask values selected in ``[0, 1]``:
            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
        **inputs_embeds**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
            Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
            than the model's internal embedding lookup matrix.
        **encoder_hidden_states**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``:
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if the model
            is configured as a decoder.
        **encoder_attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask
            is used in the cross-attention if the model is configured as a decoder.
            Mask values selected in ``[0, 1]``:
            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
"""


@add_start_docstrings(
    "The bare MMBT Model outputting raw hidden-states without any specific head on top.",
    MMBT_START_DOCSTRING,
    MMBT_INPUTS_DOCSTRING,
)
class MMBTModel(nn.Module, ModuleUtilsMixin):
    r"""
        Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
            **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
                Sequence of hidden-states at the output of the last layer of the model.
            **pooler_output**: ``torch.FloatTensor`` of shape ``(batch_size, hidden_size)``
                Last layer hidden-state of the first token of the sequence (classification token)
                further processed by a Linear layer and a Tanh activation function. The Linear
                layer weights are trained from the next sentence prediction (classification)
                objective during BERT pretraining. This output is usually *not* a good summary
                of the semantic content of the input; you're often better off averaging or pooling
                the sequence of hidden-states for the whole input sequence.
            **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
                list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
                of shape ``(batch_size, sequence_length, hidden_size)``:
                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
            **attentions**: (`optional`, returned when ``config.output_attentions=True``)
                list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
                Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

        Examples::

            # For example purposes. Not runnable.
            transformer = BertModel.from_pretrained('bert-base-uncased')
            encoder = ImageEncoder(args)
            mmbt = MMBTModel(config, transformer, encoder)
        """

    def __init__(self, config, transformer, encoder):
        super().__init__()
        self.config = config
        self.transformer = transformer
        self.modal_encoder = ModalEmbeddings(config, encoder, transformer.embeddings)

    def forward(
        self,
        input_modal,
        input_ids=None,
        modal_start_tokens=None,
        modal_end_tokens=None,
        attention_mask=None,
        token_type_ids=None,
        modal_token_type_ids=None,
        position_ids=None,
        modal_position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
    ):
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input_txt_shape = input_ids.size()
        elif inputs_embeds is not None:
            input_txt_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        device = input_ids.device if input_ids is not None else inputs_embeds.device

        # Embed the second modality, optionally framed by start/end tokens (e.g. [CLS]/[SEP]).
        modal_embeddings = self.modal_encoder(
            input_modal,
            start_token=modal_start_tokens,
            end_token=modal_end_tokens,
            position_ids=modal_position_ids,
            token_type_ids=modal_token_type_ids,
        )

        input_modal_shape = modal_embeddings.size()[:-1]

        if token_type_ids is None:
            # Text defaults to segment 1; modal embeddings use segment 0 (see ModalEmbeddings).
            token_type_ids = torch.ones(input_txt_shape, dtype=torch.long, device=device)

        txt_embeddings = self.transformer.embeddings(
            input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds
        )

        # Modal embeddings come first; text embeddings are appended after them.
        embedding_output = torch.cat([modal_embeddings, txt_embeddings], 1)

        input_shape = embedding_output.size()[:-1]

        if attention_mask is None:
            attention_mask = torch.ones(input_shape, device=device)
        else:
            # The caller's mask covers only the text part; pad the modal part with ones.
            attention_mask = torch.cat(
                [torch.ones(input_modal_shape, device=device, dtype=torch.long), attention_mask], dim=1
            )

        if encoder_attention_mask is None:
            encoder_attention_mask = torch.ones(input_shape, device=device)
        else:
            encoder_attention_mask = torch.cat(
                [torch.ones(input_modal_shape, device=device), encoder_attention_mask], dim=1
            )

        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, self.device)
        encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        encoder_outputs = self.transformer.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_extended_attention_mask,
        )

        sequence_output = encoder_outputs[0]
        pooled_output = self.transformer.pooler(sequence_output)

        # Add hidden_states and attentions if the underlying transformer returns them.
        outputs = (sequence_output, pooled_output) + encoder_outputs[1:]
        return outputs

    def get_input_embeddings(self):
        # Delegate to the text transformer's word embeddings (shared with ModalEmbeddings).
        return self.transformer.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.transformer.embeddings.word_embeddings = value
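

# Layout note (a summary of the forward pass above, added for readers of this file):
# the joint sequence fed to ``self.transformer.encoder`` is
#
#     [modal_start] [modal_1 ... modal_k] [modal_end] [txt_1 ... txt_m]
#
# i.e. the projected modal embeddings come first and the text embeddings are
# concatenated after them. A caller-supplied ``attention_mask`` should cover only
# the text tokens; ``forward`` pads the modal positions with ones itself.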
Rr4   zMMBT Model with a sequence classification/regression head on top (a linear layer on top of
                      the pooled output)c                       s*   e Zd ZdZ fddZdddZ  ZS )MMBTForClassificationa  
            **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
                Labels for computing the sequence classification/regression loss.
                Indices should be in ``[0, ..., config.num_labels - 1]``.
                If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
                If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).

        Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
            **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
                Classification (or regression if config.num_labels==1) loss.
            **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
                Classification (or regression if config.num_labels==1) scores (before SoftMax).
            **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
                list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
                of shape ``(batch_size, sequence_length, hidden_size)``:
                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
            **attentions**: (`optional`, returned when ``config.output_attentions=True``)
                list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
                Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

        Examples::

            # For example purposes. Not runnable.
            transformer = BertModel.from_pretrained('bert-base-uncased')
            encoder = ImageEncoder(args)
            model = MMBTForClassification(config, transformer, encoder)
            outputs = model(input_modal, input_ids, labels=labels)
            loss, logits = outputs[:2]
        """

    def __init__(self, config, transformer, encoder):
        super().__init__()
        self.num_labels = config.num_labels

        self.mmbt = MMBTModel(config, transformer, encoder)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

    def forward(
        self,
        input_modal,
        input_ids=None,
        modal_start_tokens=None,
        modal_end_tokens=None,
        attention_mask=None,
        token_type_ids=None,
        modal_token_type_ids=None,
        position_ids=None,
        modal_position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
    ):
        outputs = self.mmbt(
            input_modal=input_modal,
            input_ids=input_ids,
            modal_start_tokens=modal_start_tokens,
            modal_end_tokens=modal_end_tokens,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            modal_token_type_ids=modal_token_type_ids,
            position_ids=position_ids,
            modal_position_ids=modal_position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )

        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        # Keep hidden states / attentions (if any) after the logits.
        outputs = (logits,) + outputs[2:]

        if labels is not None:
            if self.num_labels == 1:
                # Single label: treat as regression (mean-squared error).
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1))
            else:
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            outputs = (loss,) + outputs

        return outputs  # (loss), logits, (hidden_states), (attentions)