import torch
import torch.distributed as dist
from torch import nn


def _quantize_per_tensor_cuda(x, scale, zero_point):
    # Map to uint8 values in [0, 255] using a single scale / zero point.
    y = torch.round(x / scale) + zero_point
    y = torch.clamp(y, 0, 255).to(torch.uint8)
    return y


def _dequantize_per_tensor_cuda(y, scale, zero_point):
    x = scale * (y.to(torch.float32) - zero_point)
    return x


def _quantize_per_channel_cuda(x, scale, zero_point):
    # Quantize each row (channel) of ``x`` with its own scale and zero point.
    y = torch.zeros(x.size(), device=x.device)
    for i in range(x.size()[0]):
        y[i, :] = torch.round(x[i, :] / scale[i]) + zero_point[i]
    y = torch.clamp(y, 0, 255).to(torch.uint8)
    return y


def _dequantize_per_channel_cuda(y, scale, zero_point):
    y = y.to(torch.float32).cuda(y.device)
    x = torch.zeros_like(y, device=y.device)
    for i in range(x.size()[0]):
        x[i, :] = scale[i] * (y[i, :] - zero_point[i])
    return x


def _get_allgather_out_list(all_gather_in_list, world_size):
    # Pre-allocate one output tensor per rank for ``dist.all_gather``.
    out_list = [
        torch.zeros_like(
            all_gather_in_list,
            device=all_gather_in_list.device,
            dtype=all_gather_in_list.dtype,
        )
        for _ in range(world_size)
    ]
    return out_list
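

# NOTE: the block below is an editorial sketch, not part of the original module. It
# illustrates the per-tensor round trip implemented by the helpers above, assuming a
# CUDA device is available; ``_example_pertensor_roundtrip`` is a hypothetical name.
def _example_pertensor_roundtrip():
    x = torch.rand(8, device="cuda")  # values in [0, 1)
    scale, zero_point = 1.0 / 255, 0
    # Quantize to uint8 values in [0, 255], then map back to float32.
    q = _quantize_per_tensor_cuda(x, scale, zero_point)
    x_hat = _dequantize_per_tensor_cuda(q, scale, zero_point)
    return x_hat  # approximately x, up to half a quantization step (~0.002)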


def quantization_pertensor_hook(
    process_group: dist.ProcessGroup, bucket: dist.GradBucket
) -> torch.futures.Future[torch.Tensor]:
    """
    Applies the ``torch.quantize_per_tensor`` logic to DDP using ``allgather``
    protocol. Workers first allgather the scale and zero point of their own
    ``GradBucket`` prior to the quantization. After all workers have that information,
    the first ``then`` callback called ``quantize_and_allgather`` quantizes the worker's
    own gradient tensor, and uses ``allgather`` to communicate the quantized tensors across all workers.
    The final ``then`` callback called ``dequantize_and_aggregate`` dequantizes and
    aggregates each quantized gradient tensor locally and returns the mean.

    .. warning ::
        This is experimental, and uses ``allgather`` protocol which is considerably slower than
        ``allreduce`` protocol. It works only with flattened grads.

    Example::
        >>> # xdoctest: +SKIP
        >>> ddp_model.register_comm_hook(process_group, quantization_pertensor_hook)
    """
    group_to_use = process_group if process_group is not None else dist.group.WORLD
    rank = process_group.rank() if process_group is not None else dist.get_rank()
    world_size = group_to_use.size()

    tensor = bucket.buffer()

    myObserver = torch.quantization.MinMaxObserver().cuda(tensor.device)
    myObserver(tensor)

    s, z = myObserver.calculate_qparams()
    s_and_z = torch.FloatTensor([s, z]).cuda(tensor.device)

    all_ranks_s_and_z = _get_allgather_out_list(s_and_z, world_size)

    # First, allgather the scale and zero point of every rank.
    fut = dist.all_gather(
        all_ranks_s_and_z, s_and_z, group=group_to_use, async_op=True
    ).get_future()

    def quantize_and_allgather(fut):
        # Store the scales and zero points of all ranks.
        all_ranks_s_and_z = fut.wait()[0]
        # Quantize this rank's gradient tensor with its own scale and zero point.
        quantized_tensor = _quantize_per_tensor_cuda(
            tensor, all_ranks_s_and_z[rank][0], all_ranks_s_and_z[rank][1]
        )
        # Allgather the quantized tensors.
        fut = dist.all_gather(
            _get_allgather_out_list(quantized_tensor, world_size),
            quantized_tensor,
            group=group_to_use,
            async_op=True,
        ).get_future()

        return fut.wait()

    def dequantize_and_aggregate(fut):
        all_ranks_quantized_tensor = fut.wait()[0]

        aggregated_dequantized_tensor = torch.zeros_like(
            all_ranks_quantized_tensor[0], device=tensor.device, dtype=torch.float32
        )
        # Dequantize each rank's tensor locally with that rank's previously
        # allgathered scale and zero point, then aggregate.
        for r, quantized_tensor in enumerate(all_ranks_quantized_tensor):
            aggregated_dequantized_tensor += _dequantize_per_tensor_cuda(
                quantized_tensor, all_ranks_s_and_z[r][0], all_ranks_s_and_z[r][1]
            )

        return aggregated_dequantized_tensor / world_size

    return fut.then(quantize_and_allgather).then(dequantize_and_aggregate)
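

# NOTE: the block below is an editorial sketch, not part of the original module. It
# shows one way to register ``quantization_pertensor_hook`` on a DDP model, assuming
# ``torch.distributed`` has already been initialized and ``device_id`` is this rank's
# local GPU index; ``_example_register_pertensor_hook`` is a hypothetical name.
def _example_register_pertensor_hook(model, device_id):
    ddp_model = nn.parallel.DistributedDataParallel(model, device_ids=[device_id])
    # Passing ``None`` as the state makes the hook fall back to ``dist.group.WORLD``.
    ddp_model.register_comm_hook(None, quantization_pertensor_hook)
    return ddp_model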


def quantization_perchannel_hook(
    process_group: dist.ProcessGroup, bucket: dist.GradBucket, bucket_size=512
) -> torch.futures.Future[torch.Tensor]:
    """
    Applies the ``torch.quantize_per_channel`` logic to DDP using ``allgather``
    protocol. Compared to per-tensor quantization, the main motivation of per-channel
    quantization is considerably large tensors: for a tensor that contains 6 million
    elements, quantizing per bucket of 512 (or 128) elements may significantly
    increase the resolution.

    It first splits the ``GradBucket`` tensor into multiple chunks (channels) of ``bucket_size``
    elements. Then, workers allgather the scales and zero points of their own
    ``GradBucket`` prior to the quantization. After all workers have that information,
    the first ``then`` callback called ``quantize_and_allgather`` quantizes the worker's
    own gradient tensor, and uses ``allgather`` to communicate the quantized tensors across all workers.
    The final ``then`` callback called ``dequantize_and_aggregate`` dequantizes, flattens, and
    aggregates each quantized gradient tensor locally and returns the mean.

    .. warning ::
        This is experimental, and uses ``allgather`` protocol which is considerably slower than
        ``allreduce`` protocol. It works only with flattened grads.

    Example::
        >>> # xdoctest: +SKIP
        >>> ddp_model.register_comm_hook(process_group, quantization_perchannel_hook)
    """
    group_to_use = process_group if process_group is not None else dist.group.WORLD
    rank = process_group.rank() if process_group is not None else dist.get_rank()
    world_size = group_to_use.size()

    tensor = bucket.buffer()

    # Zero-pad the flattened gradient to a multiple of ``bucket_size`` and view it as
    # (num_channels, bucket_size) so each row gets its own quantization parameters.
    tensor_in_channels = (
        nn.functional.pad(
            input=tensor,
            pad=(0, bucket_size - len(tensor) % bucket_size),
            mode="constant",
            value=0,
        )
        .view(-1, bucket_size)
        .cuda(tensor.device)
    )

    myPerChannelObserver = torch.quantization.PerChannelMinMaxObserver().cuda(
        tensor.device
    )
    myPerChannelObserver(tensor_in_channels)

    s_ch, z_ch = myPerChannelObserver.calculate_qparams()
    s_and_z = torch.stack((s_ch, z_ch)).cuda(tensor.device)

    all_ranks_s_and_z = _get_allgather_out_list(s_and_z, world_size)
    # First, allgather the per-channel scales and zero points of every rank.
    fut = dist.all_gather(
        all_ranks_s_and_z, s_and_z, group=group_to_use, async_op=True
    ).get_future()

    def quantize_and_allgather(fut):
        # Store the scales and zero points of all ranks.
        all_ranks_s_and_z = fut.wait()[0]
        # Quantize this rank's chunked gradient tensor with its own per-channel parameters.
        quantized_tensor = _quantize_per_channel_cuda(
            tensor_in_channels,
            all_ranks_s_and_z[rank][0, :],
            all_ranks_s_and_z[rank][1, :],
        )
        # Allgather the quantized tensors.
        fut = dist.all_gather(
            _get_allgather_out_list(quantized_tensor, world_size),
            quantized_tensor,
            group=group_to_use,
            async_op=True,
        ).get_future()

        return fut.wait()

    def dequantize_and_aggregate(fut):
        all_ranks_quantized_tensor = fut.wait()[0]

        aggregated_dequantized_tensor = torch.zeros_like(
            all_ranks_quantized_tensor[0], device=tensor.device, dtype=torch.float32
        )

        # Dequantize each rank's chunked tensor locally with that rank's previously
        # allgathered per-channel parameters, then aggregate.
        for r, quantized_tensor in enumerate(all_ranks_quantized_tensor):
            aggregated_dequantized_tensor += _dequantize_per_channel_cuda(
                quantized_tensor, all_ranks_s_and_z[r][0], all_ranks_s_and_z[r][1]
            )

        # Flatten, strip the padding, and return the mean.
        return (
            torch.flatten(aggregated_dequantized_tensor).cuda(tensor.device)[
                : tensor.size()[0]
            ]
            / world_size
        )

    return fut.then(quantize_and_allgather).then(dequantize_and_aggregate)
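

# NOTE: the block below is an editorial sketch, not part of the original module. It
# illustrates the channel split performed by ``quantization_perchannel_hook`` before the
# observer runs: the flattened gradient is zero-padded to a multiple of ``bucket_size``
# and viewed as ``(num_channels, bucket_size)`` so each row gets its own scale and zero
# point. ``_example_channel_split`` is a hypothetical name.
def _example_channel_split(tensor, bucket_size=512):
    padded = nn.functional.pad(
        input=tensor,
        pad=(0, bucket_size - len(tensor) % bucket_size),
        mode="constant",
        value=0,
    )
    # Each row is one "channel" of ``bucket_size`` elements. (Like the hook itself, this
    # pads a full extra bucket when the length is already a multiple of ``bucket_size``.)
    return padded.view(-1, bucket_size)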