U
    &cM                     @   sz   d dl Z d dlZd dlZd dlZd dlZd dlmZ ddlmZ ddl	m
Z
 e eZG dd deZG dd	 d	eZdS )
    N)Dataset   )PreTrainedTokenizer)torch_distributed_zero_firstc                   @   s<   e Zd ZdZdeeedddZdd Ze	j
d	d
dZdS )TextDatasetL
    This will be superseded by a framework-agnostic approach
    soon.
    F	tokenizer	file_path
block_sizec              
   C   s  t j|st||jdd }t j|\}}t j|d|jj	t
||}t|* t j|r|st }	t|d}
t|
| _W 5 Q R X td| dt |	  ntd|  g | _t|dd	}| }W 5 Q R X |||}td
t|| d |D ]$}| j|||||   qt }	t|d}
tj| j|
tjd W 5 Q R X td|t |	  W 5 Q R X d S )NF)pairzcached_lm_{}_{}_{}rbz"Loading features from cached file z [took %.3f s]z'Creating features from dataset file at utf-8encodingr      wb)protocolz1Saving features into cached file %s [took %.3f s])ospathisfileAssertionErrorZnum_special_tokens_to_addsplitjoinformat	__class____name__strr   existstimeopenpickleloadexamplesloggerinforeadZconvert_tokens_to_idstokenizerangelenappendZ build_inputs_with_special_tokensdumpHIGHEST_PROTOCOL)selfr
   r   r   Zoverwrite_cache
local_rank	directoryfilenameZcached_features_filestarthandleftextZtokenized_texti r7   P/tmp/pip-unpacked-wheel-ymerj3tt/transformers/data/datasets/language_modeling.py__init__   sB     
 
  
zTextDataset.__init__c                 C   s
   t | jS Nr*   r$   r.   r7   r7   r8   __len__F   s    zTextDataset.__len__returnc                 C   s   t j| j| t jdS N)ZdtypetorchZtensorr$   longr.   r6   r7   r7   r8   __getitem__I   s    zTextDataset.__getitem__N)Fr   r   
__module____qualname____doc__r   r   intr9   r=   rB   ZTensorrE   r7   r7   r7   r8   r      s        0r   c                   @   s<   e Zd ZdZdeeedddZdd Ze	j
dd	d
ZdS )LineByLineTextDatasetr   r   r	   c              	   C   sh   t j|sttd| t|dd}dd |  D }W 5 Q R X |j	|d|d}|d | _
d S )	Nz)Creating features from dataset file at %sr   r   c                 S   s$   g | ]}t |d kr| s|qS )r   )r*   isspace).0liner7   r7   r8   
<listcomp>[   s       z2LineByLineTextDataset.__init__.<locals>.<listcomp>T)Zadd_special_tokens
max_lengthZ	input_ids)r   r   r   r   r%   r&   r!   r'   
splitlinesZbatch_encode_plusr$   )r.   r
   r   r   r/   r4   linesZbatch_encodingr7   r7   r8   r9   S   s     zLineByLineTextDataset.__init__c                 C   s
   t | jS r:   r;   r<   r7   r7   r8   r=   `   s    zLineByLineTextDataset.__len__r>   c                 C   s   t j| j| t jdS r@   rA   rD   r7   r7   r8   rE   c   s    z!LineByLineTextDataset.__getitem__N)r   rF   r7   r7   r7   r8   rK   M   s   rK   )loggingr   r"   r    rB   Ztorch.utils.data.datasetr   Ztokenization_utilsr   Ztrainerr   	getLoggerr   r%   r   rK   r7   r7   r7   r8   <module>   s   
=