U
    &c5                     @   s   d dl Z d dlZd dlZd dlZd dlmZ d dlmZmZmZ ddl	m
Z
mZ eeZeG dd dZedd	G d
d dZG dd dZG dd deZdS )    N)	dataclass)ListOptionalUnion   )is_tf_availableis_torch_availablec                   @   sJ   e Zd ZU dZeed< eed< dZee ed< dZee ed< dd Z	dS )	InputExamplea5  
    A single training/test example for simple sequence classification.

    Args:
        guid: Unique id for the example.
        text_a: string. The untokenized text of the first sequence. For single
            sequence tasks, only this sequence must be specified.
        text_b: (Optional) string. The untokenized text of the second sequence.
            Only must be specified for sequence pair tasks.
        label: (Optional) string. The label of the example. This should be
            specified for train and dev examples, but not for test examples.
    guidtext_aNtext_blabelc                 C   s   t jt| ddd S )*Serializes this instance to a JSON string.   )indent
jsondumpsdataclassesZasdictself r   F/tmp/pip-unpacked-wheel-ymerj3tt/transformers/data/processors/utils.pyto_json_string2   s    zInputExample.to_json_string)
__name__
__module____qualname____doc__str__annotations__r   r   r   r   r   r   r   r   r	      s   
r	   T)frozenc                   @   sf   e Zd ZU dZee ed< dZeee  ed< dZ	eee  ed< dZ
eeeef  ed< dd ZdS )	InputFeaturesa  
    A single set of features of data.
    Property names are the same names as the corresponding inputs to a model.

    Args:
        input_ids: Indices of input sequence tokens in the vocabulary.
        attention_mask: Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
            Usually  ``1`` for tokens that are NOT MASKED, ``0`` for MASKED (padded) tokens.
        token_type_ids: (Optional) Segment token indices to indicate first and second
            portions of the inputs. Only some models use them.
        label: (Optional) Label corresponding to the input. Int for classification problems,
            float for regression problems.
    	input_idsNattention_masktoken_type_idsr   c                 C   s   t t| d S )r   r   r   r   r   r   r   r   M   s    zInputFeatures.to_json_string)r   r   r   r   r   intr    r$   r   r%   r   r   floatr   r   r   r   r   r"   7   s   
r"   c                   @   sF   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d Ze	dddZ
dS )DataProcessorzEBase class for data converters for sequence classification data sets.c                 C   s
   t  dS )zGets an example from a dict with tensorflow tensors
        Args:
            tensor_dict: Keys and values should match the corresponding Glue
                tensorflow_dataset examples.
        NNotImplementedError)r   Ztensor_dictr   r   r   get_example_from_tensor_dictU   s    z*DataProcessor.get_example_from_tensor_dictc                 C   s
   t  dS )z7Gets a collection of `InputExample`s for the train set.Nr)   r   data_dirr   r   r   get_train_examples]   s    z DataProcessor.get_train_examplesc                 C   s
   t  dS )z5Gets a collection of `InputExample`s for the dev set.Nr)   r,   r   r   r   get_dev_examplesa   s    zDataProcessor.get_dev_examplesc                 C   s
   t  dS )z*Gets the list of labels for this data set.Nr)   r   r   r   r   
get_labelse   s    zDataProcessor.get_labelsc                 C   s(   t |  dkr$|  t|j |_|S )zSome tensorflow_datasets datasets are not formatted the same way the GLUE datasets are.
        This method converts examples to the correct format.   )lenr0   r&   r   )r   exampler   r   r   tfds_mapi   s    zDataProcessor.tfds_mapNc              
   C   s:   t |ddd"}ttj|d|dW  5 Q R  S Q R X dS )z!Reads a tab separated value file.rz	utf-8-sig)encoding	)	delimiter	quotecharN)openlistcsvreader)clsZ
input_filer9   fr   r   r   	_read_tsvp   s    zDataProcessor._read_tsv)N)r   r   r   r   r+   r.   r/   r0   r4   classmethodr@   r   r   r   r   r(   R   s   r(   c                   @   sd   e Zd ZdZdddZdd Zd	d
 ZedddZedddZ	dddZ
dddZdddZdS )%SingleSentenceClassificationProcessorzA Generic processor for a single sentence classification data set.NclassificationFc                 C   s4   |d krg n|| _ |d krg n|| _|| _|| _d S N)labelsexamplesmodeverbose)r   rE   rF   rG   rH   r   r   r   __init__z   s    z.SingleSentenceClassificationProcessor.__init__c                 C   s
   t | jS rD   )r2   rF   r   r   r   r   __len__   s    z-SingleSentenceClassificationProcessor.__len__c                 C   s(   t |trt| j| j| dS | j| S )N)rE   rF   )
isinstanceslicerB   rE   rF   )r   idxr   r   r   __getitem__   s    
z1SingleSentenceClassificationProcessor.__getitem__ r   r1   c           	   
   K   s(   | f |}|j ||||||ddd |S )NT)
split_namecolumn_labelcolumn_text	column_idskip_first_rowoverwrite_labelsoverwrite_examples)add_examples_from_csv)	r>   	file_namerP   rQ   rR   rS   rT   kwargs	processorr   r   r   create_from_csv   s    

z5SingleSentenceClassificationProcessor.create_from_csvc                 K   s   | f |}|j ||d |S )N)rE   )add_examples)r>   texts_or_text_and_labelsrE   rY   rZ   r   r   r   create_from_examples   s    
z:SingleSentenceClassificationProcessor.create_from_examplesc	                 C   s   |  |}	|r|	dd  }	g }
g }g }t|	D ]^\}}|
||  |||  |d k	rj|||  q.|rzd||f nd| }|| q.| j|
||||dS )Nr1   z%s-%sz%s)rU   rV   )r@   	enumerateappendr\   )r   rX   rP   rQ   rR   rS   rT   rU   rV   linesZtextsrE   idsiliner
   r   r   r   rW      s(    
    z;SingleSentenceClassificationProcessor.add_examples_from_csvc              	   C   s  |d kst |t |kst|d ks8t |t |ks8t|d krNd gt | }|d krdd gt | }g }t }t|||D ]N\}}	}
t|ttfr|	d kr|\}}	n|}||	 |t	|
|d |	d qz|r|| _
n| j
| |rt|| _ntt| j|| _| j
S )N)r
   r   r   r   )r2   AssertionErrorsetziprK   tupler;   addr`   r	   rF   extendrE   union)r   r]   rE   rb   rU   rV   rF   Zadded_labelsZtext_or_text_and_labelr   r
   textr   r   r   r\      s*    

z2SingleSentenceClassificationProcessor.add_examplesTc                    s  |dkr|j }dd t| jD }g }t| jD ]D\}	}
|	d dkrPtd|	 |j|
jdt||j d}|	| q0t
d	d
 |D }g  tt|| jD ]\}	\}}
|	d dkrtd|	t| jf  |rdndgt| }|t| }|r|g| | }|rdndg| | }n&||g|  }||r8dndg|  }t||ksftdt||t||kstdt||| jdkr||
j }n"| jdkrt|
j}n
t| j|	dk r>| jr>td td|
j  tdddd |D   tdddd |D   td|
j|f   	t|||d q|dkrb S |dkrt s|tdddl} fdd}|jj||j|jd|jf|dg|dgd|g f}|S |d krt  std!ddl!}dd"l"m#} |j$d#d  D |j%d$}|j$d%d  D |j%d$}| jdkr`|j$d&d  D |j%d$}n&| jdkr|j$d'd  D |jd$}||||}|S td(dS ))a  
        Convert examples in a list of ``InputFeatures``

        Args:
            tokenizer: Instance of a tokenizer that will tokenize the examples
            max_length: Maximum example length
            task: GLUE task
            label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method
            output_mode: String indicating the output mode. Either ``regression`` or ``classification``
            pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default)
            pad_token: Padding token
            mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values
                and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for
                actual values)

        Returns:
            If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset``
            containing the task-specific features. If the input is a list of ``InputExamples``, will return
            a list of task-specific ``InputFeatures`` which can be fed to the model.

        Nc                 S   s   i | ]\}}||qS r   r   ).0rc   r   r   r   r   
<dictcomp>   s      zFSingleSentenceClassificationProcessor.get_features.<locals>.<dictcomp>i'  r   zTokenizing example %dT)Zadd_special_tokens
max_lengthc                 s   s   | ]}t |V  qd S rD   )r2   )rm   r#   r   r   r   	<genexpr>  s     zESingleSentenceClassificationProcessor.get_features.<locals>.<genexpr>zWriting example %d/%dr1   z Error with input length {} vs {}rC   Z
regression   z*** Example ***zguid: %szinput_ids: %s c                 S   s   g | ]}t |qS r   r   rm   xr   r   r   
<listcomp>/  s     zFSingleSentenceClassificationProcessor.get_features.<locals>.<listcomp>zattention_mask: %sc                 S   s   g | ]}t |qS r   rs   rt   r   r   r   rv   0  s     zlabel: %s (id = %d)r#   r$   r   tfz?return_tensors set to 'tf' but TensorFlow 2.0 can't be importedc                  3   s$    D ]} | j | jd| jfV  qd S )Nr#   r$   rw   )exfeaturesr   r   gen<  s    z?SingleSentenceClassificationProcessor.get_features.<locals>.genry   ptz8return_tensors set to 'pt' but PyTorch can't be imported)TensorDatasetc                 S   s   g | ]
}|j qS r   )r#   rm   r?   r   r   r   rv   L  s     )Zdtypec                 S   s   g | ]
}|j qS r   )r$   r   r   r   r   rv   M  s     c                 S   s   g | ]
}|j qS r   r   r   r   r   r   rv   O  s     c                 S   s   g | ]
}|j qS r   r   r   r   r   r   rv   Q  s     z,return_tensors should be one of 'tf' or 'pt')&max_lenr_   rE   rF   loggerinfoencoder   minr`   maxrg   r2   re   formatrG   r   r'   
ValueErrorrH   r
   joinr"   r   RuntimeErrorZ
tensorflowdataZDatasetZfrom_generatorZint32Zint64ZTensorShaper   torchZtorch.utils.datar   Ztensorlong)r   	tokenizerro   Zpad_on_leftZ	pad_tokenZmask_padding_with_zeroZreturn_tensorsZ	label_mapZall_input_idsZex_indexr3   r#   Zbatch_lengthr$   Zpadding_lengthr   rx   r}   Zdatasetr   r   Zall_attention_maskZ
all_labelsr   r{   r   get_features   s      
  



"
z2SingleSentenceClassificationProcessor.get_features)NNrC   F)rO   r   r1   NF)N)rO   r   r1   NFFF)NNFF)NFr   TN)r   r   r   r   rI   rJ   rN   rA   r[   r^   rW   r\   r   r   r   r   r   rB   w   s@   
                
       
$     rB   )r<   r   r   loggingr   typingr   r   r   Z
file_utilsr   r   	getLoggerr   r   r	   r"   r(   rB   r   r   r   r   <module>   s   
%