import functools

import torch
import torch.distributed as dist


class DefaultState(object):
    """
    Stores state needed to perform the default communication algorithm
    within a communication hook.

    Args:
        process_group (ProcessGroup): The process group to be used.
    """

    __slots__ = [
        "process_group",
        "world_size",
        "gradient_predivide_factor",
        "gradient_postdivide_factor",
    ]

    def __init__(self, process_group: dist.ProcessGroup):
        if process_group is None:
            raise ValueError(f"Expected to pass in an explicit ProcessGroup to {self}.")
        self.process_group = process_group
        self.world_size = dist.get_world_size(process_group)
        # The pre- and post-division factors together average gradients by world_size
        # in two steps, which reduces the risk of underflow and overflow.
        self.gradient_predivide_factor = self._get_gradient_predivide_factor(self.world_size)
        self.gradient_postdivide_factor = self.world_size / self.gradient_predivide_factor

    def _get_gradient_predivide_factor(self, world_size: int) -> float:
        # Double the factor while it evenly divides world_size and world_size / factor still exceeds it.
        factor = 1
        while world_size % factor == 0 and world_size / factor > factor:
            factor *= 2
        return float(factor)


class LowPrecisionState(DefaultState):
    """
    Stores state needed to perform gradient communication in a lower precision
    within a communication hook. The communication hook will cast gradients back
    to the original parameter precision specified by ``parameter_type`` (default: torch.float32).
    Builds on top of the :class:`DefaultState`.

    Args:
        parameter_type (torch.dtype): The precision of the model's parameters.
            Required for a hook to cast gradients back to a parameter's precision.
    """

    __slots__ = ["parameter_type"]

    def __init__(self, process_group, parameter_type=torch.float32):
        super().__init__(process_group)
        self.parameter_type = parameter_type


def _decompress(state: LowPrecisionState, grad: torch.Tensor):
    """
    Casts gradients back to full parameter precision so that
    further computation happens in full precision.
    """
    orig_grad_data = grad.data
    grad.data = grad.data.to(state.parameter_type)
    # Keep the low-precision buffer alive until the current CUDA stream has finished using it.
    orig_grad_data.record_stream(torch.cuda.current_stream())


def allreduce_hook(state: DefaultState, grad: torch.Tensor):
    """
    This FSDP communication hook implements the ``all_reduce`` algorithm
    and a necessary pre- and post-division of gradients.

    Args:
        state (DefaultState): State information, configures pre- and post-division factors.
        grad (torch.Tensor): A gradient for the local batch that needs to be communicated across ranks.
    """
    # Pre-division, communication, and post-division together average the gradient by world_size.
    if state.gradient_predivide_factor > 1:
        grad.div_(state.gradient_predivide_factor)
    dist.all_reduce(grad, group=state.process_group)
    if state.gradient_postdivide_factor > 1:
        grad.div_(state.gradient_postdivide_factor)


def reduce_scatter_hook(state: DefaultState, grad: torch.Tensor, output: torch.Tensor):
    """
    This FSDP communication hook implements the ``reduce_scatter`` algorithm for
    sharded FSDP strategies and a necessary pre- and post-division of gradients.

    Args:
        state (DefaultState): State information, configures pre- and post-division factors.
        grad (torch.Tensor): An unsharded gradient for the local batch that needs to be
        communicated across ranks.
        output (torch.Tensor): Stores a single shard of the gradient after ``reduce_scatter``.
    """
    if state.gradient_predivide_factor > 1:
        grad.div_(state.gradient_predivide_factor)
    dist._reduce_scatter_base(output, grad, group=state.process_group)
    if state.gradient_postdivide_factor > 1:
        output.div_(state.gradient_postdivide_factor)


def _low_precision_hook(prec: torch.dtype, state: LowPrecisionState, grad: torch.Tensor, output: torch.Tensor):
    # Cast the gradient to the lower precision, run the matching communication hook,
    # then cast the result back to the parameter precision.
    grad.data = grad.data.to(prec)
    if output is not None:
        output.data = output.data.to(prec)
        reduce_scatter_hook(state, grad, output)
        _decompress(state, output)
    else:
        allreduce_hook(state, grad)
        _decompress(state, grad)


def fp16_compress_hook(state: LowPrecisionState, grad: torch.Tensor, output: torch.Tensor = None):
    """
    This FSDP communication hook implements a simple gradient compression
    approach that casts ``grad`` to half-precision floating-point format (``torch.float16``).
    It also averages gradients by ``world_size`` in two steps: first it pre-divides gradients by a
    ``state.gradient_predivide_factor``, and after a communication step (``all_reduce`` or ``reduce_scatter``)
    gradients are averaged by a ``state.gradient_postdivide_factor``.
    Once post-division is done, compressed gradients are cast back to the parameters' precision.

    Args:
        state (LowPrecisionState): State information, configures pre- and post-division factors, parameters' precision.
        grad (torch.Tensor): A gradient for the local batch that needs to be communicated across ranks in a lower precision.
        output (torch.Tensor): Stores a single shard of the gradient after ``reduce_scatter``.
    """
    fp16_hook = functools.partial(_low_precision_hook, torch.float16)
    return fp16_hook(state, grad, output)


def bf16_compress_hook(state: LowPrecisionState, grad: torch.Tensor, output: torch.Tensor = None):
    """
    Same as ``fp16_compress_hook``, but compresses gradients to brain floating-point
    format (``torch.bfloat16``) instead of ``torch.float16``.
    """
    bf16_hook = functools.partial(_low_precision_hook, torch.bfloat16)
    return bf16_hook(state, grad, output)
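

# Usage sketch: a minimal, single-process illustration of how ``DefaultState`` and
# ``allreduce_hook`` fit together. It assumes the gloo backend is available and that
# TCP port 29500 on localhost is free; in real training these hooks are registered on
# an FSDP-wrapped module rather than called by hand.
if __name__ == "__main__":
    dist.init_process_group(
        backend="gloo",
        init_method="tcp://127.0.0.1:29500",
        rank=0,
        world_size=1,
    )
    state = DefaultState(process_group=dist.group.WORLD)
    # With world_size=1 both division factors are 1.0, so the hook reduces to a plain all_reduce.
    grad = torch.ones(4)
    allreduce_hook(state, grad)
    print(grad)  # tensor([1., 1., 1., 1.])
    dist.destroy_process_group()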