import functools
from typing import Callable, Dict, TYPE_CHECKING

import torch
import torch.distributed as dist
import torch.distributed._shard.sharding_spec as shard_spec
from torch.distributed import distributed_c10d
from torch.distributed.nn.functional import reduce_scatter
from torch.distributed._shard.common_op_utils import _register_default_op
from torch.distributed._shard.op_registry_utils import _decorator_func
from torch.utils._pytree import tree_map

if TYPE_CHECKING:
    # Only needed for type annotations; avoids a circular import at runtime.
    from torch.distributed._shard.sharded_tensor import ShardedTensor

# Registry mapping a torch function to its custom _PartialTensor implementation.
_PARTIAL_TENSOR_OPS: Dict[Callable, Callable] = {}


def _custom_partial_tensor_op(func):
    """
    Decorator for a custom partial tensor op.
    Args:
        func(Callable): Torch function for which we want to provide a PartialTensor
            implementation (ex: torch.nn.functional.linear)
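
    Example (a minimal sketch of how a handler could be registered;
    ``my_partial_linear`` is hypothetical and not part of this module)::

        @_custom_partial_tensor_op(torch.nn.functional.linear)
        def my_partial_linear(types, args=(), kwargs=None, process_group=None):
            # Apply the op to the wrapped local shard and keep the result partial.
            # (Bias handling is omitted to keep the sketch short.)
            partial_input, weight = args[0], args[1]
            local_out = torch.nn.functional.linear(partial_input._local_shard, weight)
            return _PartialTensor(local_out, process_group, partial_input._reduce_op)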
    )opZop_table)	functoolspartialr   r   )func r   K/tmp/pip-unpacked-wheel-gikjz4vx/torch/distributed/_shard/partial_tensor.py_custom_partial_tensor_op   s
    r   c                       s   e Zd ZU dZejed< ejed< ej	ed< dddgZ
dej	jfddZdd	 Zejd
dddZedddZedddZ fddZ  ZS )_PartialTensora;  
    PartialTensor is an abstraction to represent Tensors that need
    aggregation across multiple devices and multiple processes.

    PartialTensor is initialized in an SPMD like fashion where each rank
    initializes the PartialTensor. The PartialTensor object on each rank
    then only stores the local partial shard, process group and the
    aggregation way to get a full tensor.

    PartialTensor doesn't provide any Tensor like operations but is a
    wrapper providing the Tensor representing the local partial shard.

    We assume the size of each local tensor to be exactly the same.

    Users can apply custom distributed sharded computations on top of
    this primitive.

    Args:
        local_partial_shard (Tensor): Partial result stored across ranks.
        process_group (ProcessGroup): The process group to aggregate on.
        reduce_op (distributed_c10d.ReduceOp): Way to aggregate the partial result.
            Default: ``distributed_c10d.ReduceOp.SUM``

    Examples:
        >>> # All tensors below are of torch.int64 type.
        >>> # We have 2 process groups, 2 ranks.
        >>> # xdoctest: +SKIP
        >>> tensor = torch.arange(2, dtype=torch.int64) + 1 + 2 * rank
        >>> tensor = torch.cat([tensor, tensor + 2])
        >>> tensor
        tensor([1, 2, 3, 4]) # Rank 0
        tensor([3, 4, 5, 6]) # Rank 1
        >>> partial_tensor = _PartialTensor(tensor, distributed_c10d.ReduceOp.MAX)
        >>> sharding_dim = 0
        >>> collect_spec = shard_spec.ChunkShardingSpec(
                dim=sharding_dim,
                placements=[
                    "rank:0/cuda:0",
                    "rank:1/cuda:1",
                ],
            )
        >>> complete_tensor = partial_tensor.reshard(collect_spec)
        >>> complete_tensor
        ShardedTensor(
            ShardedTensorMetadata(
                shards_metadata=[
                    ShardMetadata(shard_offsets=[0], shard_sizes=[2], placement=rank:0/cuda:0),
                    ShardMetadata(shard_offsets=[2], shard_sizes=[2], placement=rank:1/cuda:1)],
                size=torch.Size([4])
        )
        >>> complete_tensor.local_tensor()
        tensor([3, 4]) # Rank 0
        tensor([5, 6]) # Rank 1

        >>> # All tensors below are of torch.int64 type.
        >>> # We have 2 process groups, 2 ranks.
        >>> tensor = torch.tensor([1, 2]) + 2 * rank
        >>> tensor = torch.cat([tensor, tensor + 2])
        >>> tensor
        tensor([1, 2, 3, 4]) # Rank 0
        tensor([3, 4, 5, 6]) # Rank 1
        >>> partial_tensor = _PartialTensor(tensor)
        >>> complete_tensor = partial_tensor.reshard(collect_spec)
        >>> complete_tensor
        ShardedTensor(
            ShardedTensorMetadata(
                shards_metadata=[
                    ShardMetadata(shard_offsets=[0], shard_sizes=[2], placement=rank:0/cuda:0),
                    ShardMetadata(shard_offsets=[2], shard_sizes=[2], placement=rank:1/cuda:1)],
                size=torch.Size([4])
        )
        >>> complete_tensor.local_tensor()
        tensor([4, 6]) # Rank 0
        tensor([8, 10]) # Rank 1
    _process_group_local_shard
_reduce_opNc                 C   sL   t jj| | |j|j| |jd}|d k	r2|nt	 |_
||_||_|S )N)dtypelayoutZ
pin_memoryrequires_grad)torchTensorZ_make_wrapper_subclasssizer   r   	is_pinnedr   r   Z_get_default_groupr   r   r   )clslocal_shardprocess_group	reduce_oprr   r   r   __new__w   s    	z_PartialTensor.__new__c                 C   s   t | jtjstdd S )Nz<reduce_op needs to be a member of distributed_c10d.ReduceOp.)
isinstancer   r   ReduceOp
ValueErrorselfr   r   r   __post_init__   s    z_PartialTensor.__post_init__r   )resharding_specreturnc                    s  ddl m} t|tjs td| j r2tdt|j	}| j
|| j
  }| j}|dkrdg|	 d  }| j
 | |d< tjj|t|dd}t| j}d}d	}	dg| j
  }
t|jD ]4\}}| |kr|}| |krd
}	||
| < q|j| j
 |d |	r, fdd|
D  tt d t | j| jd}| j
 }|dkr| jj| j
 |d}|| 
 }|
 |kr||d|| }|j|||| jdS )au  
        The reshard happens in two steps logically:

        1. Aggregate all the shards of the partial tensor.
        2. Shard this tensor according to the provided spec.

        In reality, for the sake of performance, we consolidate all partial tensors
        across multiple ranks and convert to a sharded tensor in one step.

        Args:
            resharding_spec (:class:`torch.distributed._shard.sharding_spec.ShardingSpec`):
                The specification describing how we reshard the aggregated local result.

        Returns:
            A :class:`ShardedTensor` filled with local aggregated result.
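
        Example (illustrative only; assumes 2 ranks, the default ``ReduceOp.SUM``
        and a ``ChunkShardingSpec`` over dim 0)::

            rank 0 partial shard: [1, 2, 3, 4]
            rank 1 partial shard: [10, 20, 30, 40]
            aggregated result:    [11, 22, 33, 44]
            after reshard:        rank 0 holds [11, 22], rank 1 holds [33, 44]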
        """
        from torch.distributed._shard.sharded_tensor.api import ShardedTensor

        if not isinstance(resharding_spec, shard_spec.ChunkShardingSpec):
            raise NotImplementedError("Only ChunkShardingSpec supported for reshard.")
        if self._local_shard.is_complex():
            raise NotImplementedError("Only real partial tensor supported for reshard.")

        sharding_dim = int(resharding_spec.dim)
        chunk_mode_res = self._local_shard.size(sharding_dim) % self._process_group.size()
        local_shard = self._local_shard
        # Add padding when the sharding dim is not evenly divisible by the world size.
        if chunk_mode_res != 0:
            padding = [0] * (local_shard.dim() * 2)
            padding[-1] = self._process_group.size() - chunk_mode_res
            local_shard = torch.nn.functional.pad(
                local_shard,
                tuple(padding),
                "constant",
                0,
            )
        current_rank = dist.get_rank(self._process_group)
        rank_idx = None
        rearrange_local_shards = False
        indices = [0] * self._process_group.size()
        for idx, placement in enumerate(resharding_spec.placements):
            if placement.rank() == current_rank:
                rank_idx = idx
            if placement.rank() != idx:
                rearrange_local_shards = True
            indices[placement.rank()] = idx

        local_shards = local_shard.chunk(self._process_group.size(), dim=sharding_dim)
        if rearrange_local_shards:
            # Reorder the chunks so they line up with the placement order of the spec.
            local_shards = [local_shards[idx] for idx in indices]

        # Aggregate the partial results and scatter one shard back to each rank.
        local_result = reduce_scatter(
            torch.empty_like(local_shards[0]),
            list(local_shards),
            op=self._reduce_op,
            group=self._process_group,
        )

        sharded_tensor_size = self._local_shard.size()
        # Strip the padding added above so each rank keeps its original extent.
        if chunk_mode_res != 0:
            uneven_local_shards = self._local_shard.chunk(
                self._process_group.size(), dim=sharding_dim
            )
            expected_size = uneven_local_shards[rank_idx].size()
            if local_result.size() != expected_size:
                local_result = local_result.narrow(
                    sharding_dim,
                    0,
                    expected_size[sharding_dim],
                )
        return ShardedTensor._init_from_local_tensor(
            local_result,
            resharding_spec,
            sharded_tensor_size,
            process_group=self._process_group,
        )

    @classmethod
    def __torch_function__(cls, func, types, args=(), kwargs=None):
        # Find a process group from any _PartialTensor argument.
        process_group = None

        def find_process_group(e):
            nonlocal process_group
            if process_group is None and isinstance(e, _PartialTensor):
                process_group = e._process_group

        tree_map(find_process_group, args)
        tree_map(find_process_group, kwargs)

        if func in _PARTIAL_TENSOR_OPS:
            return _PARTIAL_TENSOR_OPS[func](types, args, kwargs, process_group)

        # Need to disable all dispatch to print args and kwargs appropriately.
        guard = torch._C._DisableTorchDispatch()
        try:
            with torch._C.DisableTorchFunction():
                raise RuntimeError(
                    f"torch function '{func.__name__}', with args: {args} and "
                    f"kwargs: {kwargs} not supported for PartialTensor!"
                )
        finally:
            del guard

    @classmethod
    def __torch_dispatch__(cls, func, types, args=(), kwargs=None):
        raise RuntimeError(
            f"A {cls.__name__} object is being used from c++ "
            f"while calling {func.__module__}.{func.__name__} "
            "but there is no custom __torch_dispatch__ implementation for it."
        )

    def __repr__(self):
        return f"PartialTensor({super(_PartialTensor, self).__repr__()})"


def _transpose_impl(types, args=(), kwargs=None, process_group=None):
    partial_tensor = args[0]
    input = partial_tensor._local_shard
    dim0 = args[1]
    dim1 = args[2]
    return _PartialTensor(
        torch.transpose(input, dim0, dim1),
        process_group,
        partial_tensor._reduce_op,
    )


@_custom_partial_tensor_op(torch.Tensor.transpose)
def partial_transpose(types, args=(), kwargs=None, process_group=None):
    return _transpose_impl(types, args, kwargs, process_group)


@_custom_partial_tensor_op(torch.transpose)
def partial_torch_transpose(types, args=(), kwargs=None, process_group=None):
    return _transpose_impl(types, args, kwargs, process_group)


@_custom_partial_tensor_op(torch.cat)
def partial_cat(types, args=(), kwargs=None, process_group=None):
    input_list = args[0]
    if len(input_list) == 0:
        raise RuntimeError('Empty list of tensors to torch.cat!')

    local_shards = []
    for idx, input in enumerate(input_list):
        if not isinstance(input, _PartialTensor):
            raise RuntimeError('All inputs need to be an instance of _PartialTensor')
        if idx == 0:
            reduce_op = input._reduce_op
        elif reduce_op != input._reduce_op:
            raise RuntimeError(
                'All _PartialTensor reduce_ops need to be the same, found: '
                f'{reduce_op} and {input._reduce_op}'
            )
        local_shards.append(input._local_shard)

    if kwargs is None:
        dim = 0
    else:
        if 'out' in kwargs:
            raise RuntimeError('"out" kwarg is not supported!')
        dim = kwargs['dim'] if 'dim' in kwargs else 0

    return _PartialTensor(torch.cat(local_shards, dim), process_group, reduce_op)


# Register default tensor ops (property/metadata access) for _PartialTensor.
_register_default_op(torch.Tensor.requires_grad.__get__, _custom_partial_tensor_op)
_register_default_op(torch.Tensor.shape.__get__, _custom_partial_tensor_op)
_register_default_op(torch.Tensor.dtype.__get__, _custom_partial_tensor_op)
_register_default_op(torch.Tensor.layout.__get__, _custom_partial_tensor_op)
_register_default_op(torch.Tensor.size, _custom_partial_tensor_op)
_register_default_op(torch.Tensor.dim, _custom_partial_tensor_op)
_register_default_op(torch.Tensor.ndim.__get__, _custom_partial_tensor_op)
_register_default_op(torch.Tensor.is_contiguous, _custom_partial_tensor_op)
_register_default_op(torch.Tensor.contiguous, _custom_partial_tensor_op)