U
    <ºc(2  ã                   @   sà   d dl mZ d dlmZ d dlmZmZ d dlmZ d dl	m
Z
mZmZmZmZmZmZmZ dddd	gZed
ddZedƒG dd„ deƒƒZedƒG dd„ dee ƒƒZedƒG dd	„ d	eƒƒZedƒG dd„ dee ƒƒZdS )é    )Údefaultdict)Úfunctional_datapipe)ÚIterDataPipeÚ	DataChunk)Ú_check_unpickable_fn)ÚAnyÚCallableÚDefaultDictÚIteratorÚListÚOptionalÚSizedÚTypeVarÚBatcherIterDataPipeÚGrouperIterDataPipeÚShardingFilterIterDataPipeÚUnBatcherIterDataPipeÚT_coT)Ú	covariantZsharding_filterc                   @   s>   e Zd ZdZedœdd„Zdd„ Zdd„ Zd	d
„ Zdd„ Z	dS )r   an  
    Wrapper that allows DataPipe to be sharded (functional name: ``sharding_filter``). After ``apply_sharding`` is
    called, each instance of the DataPipe (on different workers) will have every `n`-th element of the
    original DataPipe, where `n` equals to the number of instances.

    Args:
        source_datapipe: Iterable DataPipe that will be sharded
    )Úsource_datapipec                 C   s   || _ d| _d| _d S )Né   r   )r   Únum_of_instancesÚinstance_id)Úselfr   © r   úL/tmp/pip-unpacked-wheel-gikjz4vx/torch/utils/data/datapipes/iter/grouping.pyÚ__init__   s    z#ShardingFilterIterDataPipe.__init__c                 C   s   dS )NTr   ©r   r   r   r   Úis_shardable!   s    z'ShardingFilterIterDataPipe.is_shardablec                 C   s   || _ || _d S ©N)r   r   )r   r   r   r   r   r   Úapply_sharding$   s    z)ShardingFilterIterDataPipe.apply_shardingc                 c   s.   t | jƒD ]\}}|| j | jkr
|V  q
d S r   )Ú	enumerater   r   r   )r   ÚiÚitemr   r   r   Ú__iter__(   s    z#ShardingFilterIterDataPipe.__iter__c                 C   sR   t | jtƒr:t| jƒ| j | jt| jƒ| j k r4dnd S td t| ƒj	¡ƒ‚d S )Nr   r   ú%{} instance doesn't have valid length)
Ú
isinstancer   r   Úlenr   r   Ú	TypeErrorÚformatÚtypeÚ__name__r   r   r   r   Ú__len__-   s
    ÿz"ShardingFilterIterDataPipe.__len__N)
r+   Ú
__module__Ú__qualname__Ú__doc__r   r   r   r    r$   r,   r   r   r   r   r      s   Úbatchc                       sx   e Zd ZU dZeed< eed< eed< ee ed< de	feeeddœ‡ fd	d
„Z
ee	 dœdd„Zedœdd„Z‡  ZS )r   a1  
    Creates mini-batches of data (functional name: ``batch``). An outer dimension will be added as
    ``batch_size`` if ``drop_last`` is set to ``True``, or ``length % batch_size`` for the
    last batch if ``drop_last`` is set to ``False``.

    Args:
        datapipe: Iterable DataPipe being batched
        batch_size: The size of each batch
        drop_last: Option to drop the last batch if it's not full
        wrapper_class: wrapper to apply onto each batch (type ``List``) before yielding,
            defaults to ``DataChunk``

    Example:
        >>> # xdoctest: +SKIP
        >>> from torchdata.datapipes.iter import IterableWrapper
        >>> dp = IterableWrapper(range(10))
        >>> dp = dp.batch(batch_size=3, drop_last=True)
        >>> list(dp)
        [[0, 1, 2], [3, 4, 5], [6, 7, 8]]
    ÚdatapipeÚ
batch_sizeÚ	drop_lastÚlengthFN)r1   r2   r3   Úreturnc                    s<   |dkst dƒ‚tƒ  ¡  || _|| _|| _d | _|| _d S )Nr   z+Batch size is required to be larger than 0!)ÚAssertionErrorÚsuperr   r1   r2   r3   r4   Úwrapper_class)r   r1   r2   r3   r8   ©Ú	__class__r   r   r   O   s    
zBatcherIterDataPipe.__init__©r5   c                 c   sZ   g }| j D ],}| |¡ t|ƒ| jkr
|  |¡V  g }q
t|ƒdkrV| jsV|  |¡V  d S ©Nr   )r1   Úappendr'   r2   r8   r3   )r   r0   Úxr   r   r   r$   ]   s    

zBatcherIterDataPipe.__iter__c                 C   sp   | j d k	r| j S t| jtƒrX| jr6t| jƒ| j | _ nt| jƒ| j d | j | _ | j S td t	| ƒj
¡ƒ‚d S )Nr   r%   )r4   r&   r1   r   r3   r'   r2   r(   r)   r*   r+   r   r   r   r   r,   h   s    
zBatcherIterDataPipe.__len__)r+   r-   r.   r/   r   Ú__annotations__ÚintÚboolr   r   r   r
   r$   r,   Ú__classcell__r   r   r9   r   r   4   s   
üûZunbatchc                   @   s2   e Zd ZdZdeedœdd„Zdd„ Zdd	„ Zd
S )r   a   
    Undoes batching of data (functional name: ``unbatch``). In other words, it flattens the data up to the specified level
    within a batched DataPipe.

    Args:
        datapipe: Iterable DataPipe being un-batched
        unbatch_level: Defaults to ``1`` (only flattening the top level). If set to ``2``,
            it will flatten the top two levels, and ``-1`` will flatten the entire DataPipe.

    Example:
        >>> # xdoctest: +SKIP
        >>> from torchdata.datapipes.iter import IterableWrapper
        >>> source_dp = IterableWrapper([[[0, 1], [2]], [[3, 4], [5]], [[6]]])
        >>> dp1 = source_dp.unbatch()
        >>> list(dp1)
        [[0, 1], [2], [3, 4], [5], [6]]
        >>> dp2 = source_dp.unbatch(unbatch_level=2)
        >>> list(dp2)
        [0, 1, 2, 3, 4, 5, 6]
    r   ©r1   Úunbatch_levelc                 C   s   || _ || _d S r   rC   )r   r1   rD   r   r   r   r   ‹   s    zUnBatcherIterDataPipe.__init__c                 c   s,   | j D ] }| j|| jdD ]
}|V  qqd S )N©rD   )r1   Ú_diverD   )r   Úelementr"   r   r   r   r$   ‘   s    
zUnBatcherIterDataPipe.__iter__c                 c   s¾   |dk rt dƒ‚|dkrZt|tƒs,t|tƒrR|D ]}| j|ddD ]
}|V  qBq0qº|V  n`|dkrj|V  nPt|tƒs~t|tƒr¨|D ]"}| j||d dD ]
}|V  q˜q‚ntd| j› dƒ‚d S )Néÿÿÿÿz unbatch_level must be -1 or >= 0rE   r   r   zunbatch_level z" exceeds the depth of the DataPipe)Ú
ValueErrorr&   Úlistr   rF   Ú
IndexErrorrD   )r   rG   rD   r#   r"   r   r   r   rF   –   s    zUnBatcherIterDataPipe._diveN)r   )	r+   r-   r.   r/   r   r@   r   r$   rF   r   r   r   r   r   t   s    þþÚgroupbyc                   @   sv   e Zd ZdZdddddœee eeee ee e	dœdd„Z
d	d
„ Zdd„ Zddœdd„Zdd„ Zdd„ Zdd„ ZdS )r   a«	  
    Groups data from input IterDataPipe by keys which are generated from ``group_key_fn``,
    and yields a ``DataChunk`` with batch size up to ``group_size`` if defined (functional name: ``groupby``).

    The samples are read sequentially from the source ``datapipe``, and a batch of samples belonging to the same group
    will be yielded as soon as the size of the batch reaches ``group_size``. When the buffer is full,
    the DataPipe will yield the largest batch with the same key, provided that its size is larger
    than ``guaranteed_group_size``. If its size is smaller, it will be dropped if ``drop_remaining=True``.

    After iterating through the entirety of source ``datapipe``, everything not dropped due to the buffer capacity
    will be yielded from the buffer, even if the group sizes are smaller than ``guaranteed_group_size``.

    Args:
        datapipe: Iterable datapipe to be grouped
        group_key_fn: Function used to generate group key from the data of the source datapipe
        buffer_size: The size of buffer for ungrouped data
        group_size: The max size of each group, a batch is yielded as soon as it reaches this size
        guaranteed_group_size: The guaranteed minimum group size to be yielded in case the buffer is full
        drop_remaining: Specifies if the group smaller than ``guaranteed_group_size`` will be dropped from buffer
            when the buffer is full

    Example:
        >>> import os
        >>> # xdoctest: +SKIP
        >>> from torchdata.datapipes.iter import IterableWrapper
        >>> def group_fn(file):
        ...    return os.path.basename(file).split(".")[0]
        >>> source_dp = IterableWrapper(["a.png", "b.png", "a.json", "b.json", "a.jpg", "c.json"])
        >>> dp0 = source_dp.groupby(group_key_fn=group_fn)
        >>> list(dp0)
        [['a.png', 'a.json', 'a.jpg'], ['b.png', 'b.json'], ['c.json']]
        >>> # A group is yielded as soon as its size equals to `group_size`
        >>> dp1 = source_dp.groupby(group_key_fn=group_fn, group_size=2)
        >>> list(dp1)
        [['a.png', 'a.json'], ['b.png', 'b.json'], ['a.jpg'], ['c.json']]
        >>> # Scenario where `buffer` is full, and group 'a' needs to be yielded since its size > `guaranteed_group_size`
        >>> dp2 = source_dp.groupby(group_key_fn=group_fn, buffer_size=3, group_size=3, guaranteed_group_size=2)
        >>> list(dp2)
        [['a.png', 'a.json'], ['b.png', 'b.json'], ['a.jpg'], ['c.json']]
    i'  NF)Úbuffer_sizeÚ
group_sizeÚguaranteed_group_sizeÚdrop_remaining)r1   Úgroup_key_fnrM   rN   rO   rP   c                C   s¦   t |ƒ || _|| _|| _ttƒ| _d| _|| _d | _	|d k	rf|d k	rfd|  k rZ|ks`n t
‚|| _	|d k	r–|d k	rŒd|  k rŠ|ksn t
‚|| _	|| _t| _d S r<   )r   r1   rQ   Úmax_buffer_sizer   rJ   Úbuffer_elementsÚcurr_buffer_sizerN   rO   r6   rP   r   r8   )r   r1   rQ   rM   rN   rO   rP   r   r   r   r   Õ   s     
"zGrouperIterDataPipe.__init__c                 C   s¦   d }d}d }| j  ¡ D ](}t| j | ƒ|krt| j | ƒ}|}q| jd k	rn|| jk rn| jsntdt| j | ƒƒ‚| jd ks‚|| jkrŒ| j | }|  j|8  _| j |= |S )Nr   zFailed to group items)rS   Úkeysr'   rO   rP   ÚRuntimeErrorÚstrrT   )r   Zbiggest_keyZbiggest_sizeÚresult_to_yieldZfindkeyr   r   r   Ú_remove_biggest_keyï   s    
z'GrouperIterDataPipe._remove_biggest_keyc                 c   sî   | j D ]¤}|  |¡}| j|  |¡ |  jd7  _| jd k	r‚| jt| j| ƒkr‚|  | j| ¡V  |  jt| j| ƒ8  _| j|= | j| jkr|  	¡ }|d k	r|  |¡V  qt
| j ¡ ƒD ].}| j |¡}|  jt|ƒ8  _|  |¡V  qºd S )Nr   )r1   rQ   rS   r=   rT   rN   r'   r8   rR   rY   ÚtuplerU   Úpop)r   r>   ÚkeyrX   Úresr   r   r   r$     s     

zGrouperIterDataPipe.__iter__r;   c                 C   s   d| _ ttƒ| _d S r<   )rT   r   rJ   rS   r   r   r   r   Úreset  s    zGrouperIterDataPipe.resetc              	   C   s@   | j | j| j| j| j| j| j| j| jf	}t	j
d k	r<t	 
|¡S |S r   )r1   rQ   rR   rN   rO   rP   r8   Ú_valid_iterator_idÚ_number_of_samples_yieldedr   Zgetstate_hook©r   Ústater   r   r   Ú__getstate__  s    ÷

z GrouperIterDataPipe.__getstate__c              
   C   s<   |\	| _ | _| _| _| _| _| _| _| _d| _	t
tƒ| _d S r<   )r1   rQ   rR   rN   rO   rP   r8   r_   r`   rT   r   rJ   rS   ra   r   r   r   Ú__setstate__-  s    öz GrouperIterDataPipe.__setstate__c                 C   s   | j  ¡  d S r   )rS   Úclearr   r   r   r   Ú__del__<  s    zGrouperIterDataPipe.__del__)r+   r-   r.   r/   r   r   r   r@   r   rA   r   rY   r$   r^   rc   rd   rf   r   r   r   r   r   «   s&   ,ùùN)Úcollectionsr   Z%torch.utils.data.datapipes._decoratorr   Z#torch.utils.data.datapipes.datapiper   r   Z'torch.utils.data.datapipes.utils.commonr   Útypingr   r   r	   r
   r   r   r   r   Ú__all__r   r   r   r   r   r   r   r   r   Ú<module>   s$   (ü!?6