from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Any, Dict, List, NewType, Tuple

import torch
from torch.nn.utils.rnn import pad_sequence

from ..tokenization_utils import PreTrainedTokenizer


class DataCollator(ABC):
    """
    A `DataCollator` is responsible for batching
    and pre-processing samples of data as requested by the training loop.
    """

    @abstractmethod
    def collate_batch(self) -> Dict[str, torch.Tensor]:
        """
        Take a list of samples from a Dataset and collate them into a batch.

        Returns:
            A dictionary of tensors
        """
        pass


InputDataClass = NewType("InputDataClass", Any)
dS )DefaultDataCollatora  
    Very simple data collator that:
    - simply collates batches of dict-like objects
    - Performs special handling for potential keys named:
        - `label`: handles a single value (int or float) per object
        - `label_ids`: handles a list of values per object
    - does not do any additional preprocessing

    i.e., Property names of the input object will be used as corresponding inputs to the model.
    See glue and ner for example of how it's useful.
    )featuresr   c                    s*  |d }t |drj|jd k	rjt|jtkrFtjdd |D tjd}ntjdd |D tjd}d|i}njt |dr|jd k	rt|jd tkrtjd	d |D tjd}ntjd
d |D tjd}d|i}ni }t	|
 D ]D\ } dkr|d k	rt|tstj fdd|D tjd| < q|S )Nr   labelc                 S   s   g | ]
}|j qS r   r   .0fr   r   r   
<listcomp>9   s     z5DefaultDataCollator.collate_batch.<locals>.<listcomp>Zdtypec                 S   s   g | ]
}|j qS r   r   r    r   r   r   r#   ;   s     labels	label_idsc                 S   s   g | ]
}|j qS r   r&   r    r   r   r   r#   ?   s     c                 S   s   g | ]
}|j qS r   r'   r    r   r   r   r#   A   s     )r   r&   c                    s   g | ]}t | qS r   )getattrr    kr   r   r#   J   s     )hasattrr   typeintr   tensorlongfloatr&   varsitems
isinstancer   )r   r   firstr%   batchvr   r)   r   r   -   s     

$z!DefaultDataCollator.collate_batchN)r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r      s   r   c                   @   s   e Zd ZU dZeed< dZeed< dZe	ed< e
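

# Usage sketch (illustrative only, not part of the original module). The
# `InputFeatures` dataclass below is a hypothetical stand-in for the feature
# objects produced by the glue/ner processors:
#
#     @dataclass
#     class InputFeatures:
#         input_ids: List[int]
#         attention_mask: List[int]
#         label: int
#
#     features = [
#         InputFeatures(input_ids=[101, 7592, 102], attention_mask=[1, 1, 1], label=0),
#         InputFeatures(input_ids=[101, 2088, 102], attention_mask=[1, 1, 1], label=1),
#     ]
#     batch = DefaultDataCollator().collate_batch(features)
#     # batch == {"labels": tensor([0, 1]),
#     #           "input_ids": tensor([[101, 7592, 102], [101, 2088, 102]]),
#     #           "attention_mask": tensor([[1, 1, 1], [1, 1, 1]])}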
ej eeejf ddd	Ze
ej ejdd
dZejeejejf dddZdS )DataCollatorForLanguageModelingz
    Data collator used for language modeling.
    - collates batches of tensors, honoring their tokenizer's pad_token
    - preprocesses batches for masked language modeling
    	tokenizerTmlmg333333?mlm_probability)examplesr   c                 C   s6   |  |}| jr(| |\}}||dS ||dS d S )N)	input_idsZmasked_lm_labels)r<   r%   )_tensorize_batchr9   mask_tokens)r   r;   r5   inputsr%   r   r   r   r   Z   s
    

z-DataCollatorForLanguageModeling.collate_batchc                    sn   |d  d t fdd|D }|r6tj|ddS | jjd krXtd| jjj dt	|d| jj
dS d S )	Nr   c                 3   s   | ]}| d  kV  qdS )r   N)size)r!   xZlength_of_firstr   r   	<genexpr>d   s     zCDataCollatorForLanguageModeling._tensorize_batch.<locals>.<genexpr>)ZdimzCYou are attempting to pad samples but the tokenizer you are using (z) does not have one.T)Zbatch_firstZpadding_value)r@   allr   stackr8   
_pad_token
ValueError	__class__r   r
   pad_token_id)r   r;   Zare_tensors_same_lengthr   rB   r   r=   b   s    z0DataCollatorForLanguageModeling._tensorize_batch)r?   r   c           
         s   j jdkrtd| }t|j j} fdd| D }|j	tj
|tjddd  j jdk	r| j j}|j	|dd t| }d|| < tt|jd	 |@ } j  j j||< tt|jd
 |@ | @ }tjt j |jtjd}	|	| ||< ||fS )zw
        Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
        NzThis tokenizer does not have a mask token which is necessary for masked language modeling. Remove the --mlm flag if you want to use this tokenizer.c                    s   g | ]} j j|d dqS )T)Zalready_has_special_tokens)r8   Zget_special_tokens_mask)r!   valr   r   r   r#   |   s    z?DataCollatorForLanguageModeling.mask_tokens.<locals>.<listcomp>r$   g        )valueig?g      ?)r8   Z
mask_tokenrG   cloner   fullshaper:   tolistZmasked_fill_r.   boolrF   eqrI   Z	bernoulliZconvert_tokens_to_idsrandintlenr/   )
r   r?   r%   Zprobability_matrixZspecial_tokens_maskZpadding_maskZmasked_indicesZindices_replacedZindices_randomZrandom_wordsr   r   r   r>   o   s*    

"z+DataCollatorForLanguageModeling.mask_tokensN)r   r   r   r   r   __annotations__r9   rP   r:   r0   r   r   r   r   r   r   r=   r	   r>   r   r   r   r   r7   N   s   
 r7   )abcr   r   Zdataclassesr   typingr   r   r   r   r	   r   Ztorch.nn.utils.rnnr
   Ztokenization_utilsr   r   r   r   r7   r   r   r   r   <module>   s   
.