U
    &cp                     @   s@  d dl Z d dlZd dlZd dlmZ d dlmZmZ d dlZ	d dl
m
Z
 ddlmZmZ ddlmZ dd	lmZ e rd dlZd d
lmZ e rd dlZeeZdd Zdd Zdd Zdd Zdd Zdd Zd'ddZ G dd deZ!G dd de!Z"G dd  d e!Z#G d!d" d"e$Z%G d#d$ d$e$Z&G d%d& d&e$Z'dS )(    N)partial)Pool	cpu_count)tqdm   )is_tf_availableis_torch_available)whitespace_tokenize   )DataProcessor)TensorDatasetc           	      C   sp   d ||}t||d D ]H}t||d dD ]2}d | ||d  }||kr2||f    S q2q||fS )zFReturns tokenized answer spans that better match the annotated answer. r
   )jointokenizerange)	
doc_tokensZinput_startZ	input_end	tokenizerZorig_answer_textZtok_answer_textZ	new_startZnew_endZ	text_span r   F/tmp/pip-unpacked-wheel-ymerj3tt/transformers/data/processors/squad.py_improve_answer_span   s    r   c                 C   s   d}d}t | D ]l\}}|j|j d }||jk r4q||kr>q||j }|| }	t||	d|j  }
|dkst|
|kr|
}|}q||kS ):Check if this is the 'max context' doc span for the token.Nr
   {Gz?)	enumeratestartlengthminZ	doc_spansZcur_span_indexpositionZ
best_scoreZbest_span_indexZ
span_indexZdoc_spanendZnum_left_contextZnum_right_contextZscorer   r   r   _check_is_max_context&   s    

r    c                 C   s   d}d}t | D ]v\}}|d |d  d }||d k r:q||krDq||d  }|| }	t||	d|d   }
|dks~|
|kr|
}|}q||kS )r   Nr   r   r
   r   )r   r   r   r   r   r   _new_check_is_max_context:   s    r!   c                 C   s4   | dks,| dks,| dks,| dks,t | dkr0dS dS )Nr   	
i/   TF)ord)cr   r   r   _is_whitespaceP   s    ,r'   c           *      C   s.  g }|rb| j sb| j}| j}d| j||d  }dt| j}	||	dkrbt	d||	 g S g }
g }g }t
| jD ]>\}}|t| t|}|D ]}|
| || qqx|r| j s|| j }| jt| jd k r|| jd  d }nt|d }t|||t| j\}}g }tj| jd|d}dtttksTdtttkrdtjtj d n
tjtj }tjtj }|}t|| t|k rZtjtjd	kr|n|tjd	kr|n||d
d
|| t| | tjd	krdndd
d}tt|t||  |t| | }tj|d krtjd	krP|d d |d tj }n>t|d d |d d d d tj }|d |d d  }n|d }t|}i }t|D ]>}tjd	krt|| | n|}|
t|| |  ||< q||d< ||d< ||d< t|| |d< i |d< t|| |d< ||d< || d|krNqZ|d }qtt|D ]b}t|| d D ]J} t|||| |  }!tjdkr| n|| d |  }|!|| d |< qzqf|D ]X}"|"d tj}#t !|"d }$t "|$d}$tjd	krd|$ }$d|$t #t !|"d tj$kd < d|$|#< | j }%d}d}|r|%s|"d }&|"d |"d  d }'d}(||&kr||'ksd
}(|(r|#}|#}d
}%n6tjdkrd})nt|| })||& |) }||& |) }|t%|"d |"d |"d |#|$& dd|"d |"d |"d |"d |||%| j'd q|S )Nr   r
   r   z$Could not find answer: '%s' vs. '%s'F)Zadd_special_tokens
max_lengthZrobertaZ	camembertrightTZonly_secondZ
only_first)r(   Zreturn_overflowing_tokensZpad_to_max_lengthZstrideZtruncation_strategyZreturn_token_type_ids	input_idsparagraph_lentokenstoken_to_orig_mapZ*truncated_query_with_special_tokens_lengthtoken_is_max_contextr   r   Zoverflowing_tokenslefttoken_type_idsr   attention_mask)
example_index	unique_idr+   r.   r,   r-   start_positionend_positionis_impossibleqas_id)(r6   r4   r5   r   r   r	   answer_textfindloggerwarningr   appendlenr   r   r   encodequestion_textstrtypemax_lenZmax_len_single_sentenceZmax_len_sentences_pairZencode_plusZpadding_sider   Zpad_token_idindexZconvert_ids_to_tokensr   r!   Zcls_token_idnparrayZminimumwhereZsep_token_idSquadFeaturestolistr7   )*examplemax_seq_length
doc_stridemax_query_lengthis_trainingfeaturesr4   r5   Zactual_textZcleaned_answer_textZtok_to_orig_indexZorig_to_tok_indexZall_doc_tokensitokenZ
sub_tokensZ	sub_tokenZtok_start_positionZtok_end_positionZspansZtruncated_queryZsequence_added_tokensZsequence_pair_added_tokensZspan_doc_tokensZencoded_dictr+   Znon_padded_idsZlast_padding_id_positionr,   r-   rC   Zdoc_span_indexjZis_max_contextspan	cls_indexp_maskZspan_is_impossibleZ	doc_startZdoc_endZout_of_spanZ
doc_offsetr   r   r   !squad_convert_example_to_featuresV   s   



    
(
 


"rU   c                 C   s   | a d S N)r   )Ztokenizer_for_convertr   r   r   &squad_convert_example_to_features_init  s    rW   FTc	              	      s  g  t |t }t|t|fd>}	tt||||d}
tt|	j|
| ddt	| d| d W 5 Q R X g }d}d}t t	 d	| dD ]:}|sq|D ]"}||_
||_|| |d
7 }q|d
7 }q| ~|dkrt stdtjdd  D tjd}tjdd  D tjd}tjdd  D tjd}tjdd  D tjd}tjdd  D tjd}tjdd  D tjd}|stj|dtjd}t||||||}nJtjdd  D tjd}tjdd  D tjd}t||||||||} |fS |dkrt std fdd}tjtjtjtjtjdtjtjtjtjtjdf}tdgtdgtdgtg tg dtg tg tg tdgtg df}tjj|||S  S dS )a  
    Converts a list of examples into a list of features that can be directly given as input to a model.
    It is model-dependant and takes advantage of many of the tokenizer's features to create the model's inputs.

    Args:
        examples: list of :class:`~transformers.data.processors.squad.SquadExample`
        tokenizer: an instance of a child of :class:`~transformers.PreTrainedTokenizer`
        max_seq_length: The maximum sequence length of the inputs.
        doc_stride: The stride used when the context is too large and is split across several features.
        max_query_length: The maximum length of the query.
        is_training: whether to create features for model evaluation or model training.
        return_dataset: Default False. Either 'pt' or 'tf'.
            if 'pt': returns a torch.data.TensorDataset,
            if 'tf': returns a tf.data.Dataset
        threads: multiple processing threadsa-smi


    Returns:
        list of :class:`~transformers.data.processors.squad.SquadFeatures`

    Example::

        processor = SquadV2Processor()
        examples = processor.get_dev_examples(data_dir)

        features = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate,
        )
    )ZinitializerZinitargs)rJ   rK   rL   rM       )	chunksizez"convert squad examples to features)totaldescdisablei ʚ;r   zadd example index and unique idr
   ptz6PyTorch must be installed to return a PyTorch dataset.c                 S   s   g | ]
}|j qS r   )r*   .0fr   r   r   
<listcomp>_  s     z6squad_convert_examples_to_features.<locals>.<listcomp>)Zdtypec                 S   s   g | ]
}|j qS r   )r1   r^   r   r   r   ra   `  s     c                 S   s   g | ]
}|j qS r   )r0   r^   r   r   r   ra   a  s     c                 S   s   g | ]
}|j qS r   )rS   r^   r   r   r   ra   b  s     c                 S   s   g | ]
}|j qS r   )rT   r^   r   r   r   ra   c  s     c                 S   s   g | ]
}|j qS r   )r6   r^   r   r   r   ra   d  s     c                 S   s   g | ]
}|j qS r   )r4   r^   r   r   r   ra   l  s     c                 S   s   g | ]
}|j qS r   )r5   r^   r   r   r   ra   m  s     tfz<TensorFlow must be installed to return a TensorFlow dataset.c                  3   sJ   t  D ]<\} }|j|j|j| |jd|j|j|j|j|j	dfV  qd S )Nr*   r1   r0   Zfeature_indexr7   r4   r5   rS   rT   r6   )
r   r*   r1   r0   r7   r4   r5   rS   rT   r6   )rO   exrN   r   r   gen~  s    z/squad_convert_examples_to_features.<locals>.genrc   rd   N)r   r   r   rW   r   rU   listr   imapr=   r2   r3   r<   r   RuntimeErrortorchZtensorlongfloatZarangesizer   r   rb   Zint32Zint64stringZTensorShapedataZDatasetZfrom_generator)examplesr   rJ   rK   rL   rM   Zreturn_datasetthreadsZtqdm_enabledpZ	annotate_Znew_featuresr3   r2   Zexample_featuresZexample_featureZall_input_idsZall_attention_masksZall_token_type_idsZall_cls_indexZ
all_p_maskZall_is_impossibleZall_feature_indexdatasetZall_start_positionsZall_end_positionsrg   Ztrain_typesZtrain_shapesr   rf   r   "squad_convert_examples_to_features	  s    /   




     




ru   c                   @   sH   e Zd ZdZdZdZdddZdddZddd	Zdd
dZ	dd Z
dS )SquadProcessorz
    Processor for the SQuAD data set.
    Overriden by SquadV1Processor and SquadV2Processor, used by the version 1.1 and version 2.0 of SQuAD, respectively.
    NFc              	   C   s   |s8|d d d   d}|d d d   }g }n,dd t|d d |d d D }d }d }t|d   d|d	   d|d
   d|||d   d|dS )Nanswerstextr   utf-8answer_startc                 S   s(   g | ] \}}|  |  d dqS )ry   )rz   rx   )numpydecode)r_   r   rx   r   r   r   ra     s   z@SquadProcessor._get_example_from_tensor_dict.<locals>.<listcomp>idquestioncontexttitle)r7   r?   context_textr8   start_position_characterr   rw   )r{   r|   zipSquadExample)selftensor_dictevaluateanswerrz   rw   r   r   r   _get_example_from_tensor_dict  s$    z,SquadProcessor._get_example_from_tensor_dictc                 C   s@   |r|d }n|d }g }t |D ]}|| j||d q"|S )as  
        Creates a list of :class:`~transformers.data.processors.squad.SquadExample` using a TFDS dataset.

        Args:
            dataset: The tfds dataset loaded from `tensorflow_datasets.load("squad")`
            evaluate: boolean specifying if in evaluation mode or in training mode

        Returns:
            List of SquadExample

        Examples::

            import tensorflow_datasets as tfds
            dataset = tfds.load("squad")

            training_examples = get_examples_from_dataset(dataset, evaluate=False)
            evaluation_examples = get_examples_from_dataset(dataset, evaluate=True)
        Z
validationtrain)r   )r   r<   r   )r   rt   r   rq   r   r   r   r   get_examples_from_dataset  s    
z(SquadProcessor.get_examples_from_datasetc              	   C   sj   |dkrd}| j dkrtdttj||dkr6| j n|ddd}t|d }W 5 Q R X | |dS )	a  
        Returns the training examples from the data directory.

        Args:
            data_dir: Directory containing the data files used for training and evaluating.
            filename: None by default, specify this if the training file has a different name than the original one
                which is `train-v1.1.json` and `train-v2.0.json` for squad versions 1.1 and 2.0 respectively.

        N NSquadProcessor should be instantiated via SquadV1Processor or SquadV2Processorrry   encodingrp   r   )	
train_file
ValueErroropenospathr   jsonload_create_examplesr   data_dirfilenamereader
input_datar   r   r   get_train_examples  s    

  z!SquadProcessor.get_train_examplesc              	   C   sj   |dkrd}| j dkrtdttj||dkr6| j n|ddd}t|d }W 5 Q R X | |dS )	a  
        Returns the evaluation example from the data directory.

        Args:
            data_dir: Directory containing the data files used for training and evaluating.
            filename: None by default, specify this if the evaluation file has a different name than the original one
                which is `train-v1.1.json` and `train-v2.0.json` for squad versions 1.1 and 2.0 respectively.
        Nr   r   r   ry   r   rp   dev)	dev_filer   r   r   r   r   r   r   r   r   r   r   r   get_dev_examples  s    	
  zSquadProcessor.get_dev_examplesc                 C   s   |dk}g }t |D ]}|d }|d D ]}|d }|d D ]}	|	d }
|	d }d }d }g }d|	krn|	d }nd	}|s|r|	d
 d }|d }|d }n|	d
 }t|
|||||||d}|| q<q(q|S )Nr   r   Z
paragraphsr   Zqasr}   r~   r6   Frw   r   rx   rz   )r7   r?   r   r8   r   r   r6   rw   )r   r   r<   )r   r   set_typerM   rq   entryr   Z	paragraphr   Zqar7   r?   r   r8   rw   r6   r   rI   r   r   r   r   $  sB    

zSquadProcessor._create_examples)F)F)N)N)__name__
__module____qualname____doc__r   r   r   r   r   r   r   r   r   r   r   rv     s   



rv   c                   @   s   e Zd ZdZdZdS )SquadV1Processorztrain-v1.1.jsonzdev-v1.1.jsonNr   r   r   r   r   r   r   r   r   r   N  s   r   c                   @   s   e Zd ZdZdZdS )SquadV2Processorztrain-v2.0.jsonzdev-v2.0.jsonNr   r   r   r   r   r   S  s   r   c                   @   s   e Zd ZdZg dfddZdS )r   aT  
    A single training/test example for the Squad dataset, as loaded from disk.

    Args:
        qas_id: The example's unique identifier
        question_text: The question string
        context_text: The context string
        answer_text: The answer string
        start_position_character: The character position of the start of the answer
        title: The title of the example
        answers: None by default, this is used during evaluation. Holds answers as well as their start positions.
        is_impossible: False by default, set to True if the example has no possible answer.
    Fc	                 C   s   || _ || _|| _|| _|| _|| _|| _d\| _| _g }	g }
d}| jD ]H}t	|rZd}n$|rj|	
| n|	d  |7  < d}|

t|	d  qH|	| _|
| _|d k	r|s|
| | _|
t|t| d t|
d  | _d S )N)r   r   Tr   Fr
   )r7   r?   r   r8   r   r6   rw   r4   r5   r'   r<   r=   r   char_to_word_offsetr   )r   r7   r?   r   r8   r   r   rw   r6   r   r   Zprev_is_whitespacer&   r   r   r   __init__g  s4    

zSquadExample.__init__Nr   r   r   r   r   r   r   r   r   r   X  s   r   c                   @   s    e Zd ZdZdedddZdS )rG   a)  
    Single squad example features to be fed to a model.
    Those features are model-specific and can be crafted from :class:`~transformers.data.processors.squad.SquadExample`
    using the :method:`~transformers.data.processors.squad.squad_convert_examples_to_features` method.

    Args:
        input_ids: Indices of input sequence tokens in the vocabulary.
        attention_mask: Mask to avoid performing attention on padding token indices.
        token_type_ids: Segment token indices to indicate first and second portions of the inputs.
        cls_index: the index of the CLS token.
        p_mask: Mask identifying tokens that can be answers vs. tokens that cannot.
            Mask with 1 for tokens than cannot be in the answer and 0 for token that can be in an answer
        example_index: the index of the example
        unique_id: The unique Feature identifier
        paragraph_len: The length of the context
        token_is_max_context: List of booleans identifying which tokens have their maximum context in this feature object.
            If a token does not have their maximum context in this feature object, it means that another feature object
            has more information related to that token and should be prioritized over this feature for that token.
        tokens: list of tokens corresponding to the input ids
        token_to_orig_map: mapping between the tokens and the original text, needed in order to identify the answer.
        start_position: start of the answer token index
        end_position: end of the answer token index
    N)r7   c                 C   s^   || _ || _|| _|| _|| _|| _|| _|| _|	| _|
| _	|| _
|| _|| _|| _|| _d S rV   )r*   r1   r0   rS   rT   r2   r3   r+   r.   r,   r-   r4   r5   r6   r7   )r   r*   r1   r0   rS   rT   r2   r3   r+   r.   r,   r-   r4   r5   r6   r7   r   r   r   r     s    zSquadFeatures.__init__)N)r   r   r   r   r@   r   r   r   r   r   rG     s
   ( rG   c                   @   s   e Zd ZdZdddZdS )SquadResultaJ  
    Constructs a SquadResult which can be used to evaluate a model's output on the SQuAD dataset.

    Args:
        unique_id: The unique identifier corresponding to that example.
        start_logits: The logits corresponding to the start of the answer
        end_logits: The logits corresponding to the end of the answer
    Nc                 C   s,   || _ || _|| _|r(|| _|| _|| _d S rV   )start_logits
end_logitsr3   start_top_indexend_top_index
cls_logits)r   r3   r   r   r   r   r   r   r   r   r     s    zSquadResult.__init__)NNNr   r   r   r   r   r     s   	r   )Fr
   T)(r   loggingr   	functoolsr   multiprocessingr   r   r{   rD   r   Z
file_utilsr   r   Ztokenization_bertr	   utilsr   rk   Ztorch.utils.datar   Z
tensorflowrb   	getLoggerr   r:   r   r    r!   r'   rU   rW   ru   rv   r   r   objectr   rG   r   r   r   r   r   <module>   sD   
 /   
 1 ?>