U
    <cj                     @   s@  d dl mZ d dlZd dlmZmZmZmZmZ d dl	Z	d dl	m
Z
 ddlmZmZ ddgZG d	d
 d
Zddeee	jjj e	jee dddZG dd deZdee
 ee
 ee
 ee
 ee
 ee
 ee eeeee ee eeeeeeedddZee
 ee
 ee
 ee
 ee
 ee
 ee ee eeeeeeeeedddZee
 ee
 ee
 ee
 ee
 ee
 ee ee eeeeeeeeedddZe	 ee
 ee
 ee
 ee
 ee
 ee
 eeee	jf eee
  f dddZee
 ee
 ee
 ee
 ee
 ee
 ee ee eeeeeeeeeddddZdS )    )defaultdictN)castListOptionalDictTuple)Tensor   )	Optimizer_use_grad_for_differentiableAdamadamc                   @   sD   e Zd ZU eed< eeef ed< eddddZeddd	ZdS )
_MultiDeviceReplicatormain_tensor_per_device_tensorsN)r   returnc                 C   s   || _ t|j|i| _d S N)r   strdevicer   )selfr    r   4/tmp/pip-unpacked-wheel-gikjz4vx/torch/optim/adam.py__init__   s    z_MultiDeviceReplicator.__init__r   c                 C   s4   || j kr| j | S | jj|ddd}|| j |< |S )NT)r   non_blockingcopy)r   r   to)r   r   tensorr   r   r   get   s
    


z_MultiDeviceReplicator.get)	__name__
__module____qualname__r   __annotations__r   r   r   r   r   r   r   r   r      s   
r   grad_scaler)	optimizerr$   r   r   c              	      sl   |d krd S | | } fdd| D }t|dks@tdt  ttjt|}W 5 Q R X t	|S )Nc                    s   g | ]}|j  d dqS )T)r   )r   ).0fr   r   r   
<listcomp>,   s     z'_get_fp16AMP_params.<locals>.<listcomp>r   z5No inf checks were recorded in _check_inf_per_device.)
Z_check_inf_per_devicevalueslenAssertionErrortorchno_gradr   r   sumr   )r%   r$   r   Zfound_inf_dictZ
found_infsZfound_inf_combinedr   r   r   _get_fp16AMP_params    s    

r/   c                       sb   e Zd ZdZdddddddee eeeed fd	d
Z fddZedddddZ	  Z
S )r   au  Implements Adam algorithm.

    .. math::
       \begin{aligned}
            &\rule{110mm}{0.4pt}                                                                 \\
            &\textbf{input}      : \gamma \text{ (lr)}, \beta_1, \beta_2
                \text{ (betas)},\theta_0 \text{ (params)},f(\theta) \text{ (objective)}          \\
            &\hspace{13mm}      \lambda \text{ (weight decay)},  \: \textit{amsgrad},
                \:\textit{maximize}                                                              \\
            &\textbf{initialize} :  m_0 \leftarrow 0 \text{ ( first moment)},
                v_0\leftarrow 0 \text{ (second moment)},\: \widehat{v_0}^{max}\leftarrow 0\\[-1.ex]
            &\rule{110mm}{0.4pt}                                                                 \\
            &\textbf{for} \: t=1 \: \textbf{to} \: \ldots \: \textbf{do}                         \\

            &\hspace{5mm}\textbf{if} \: \textit{maximize}:                                       \\
            &\hspace{10mm}g_t           \leftarrow   -\nabla_{\theta} f_t (\theta_{t-1})         \\
            &\hspace{5mm}\textbf{else}                                                           \\
            &\hspace{10mm}g_t           \leftarrow   \nabla_{\theta} f_t (\theta_{t-1})          \\
            &\hspace{5mm}\textbf{if} \: \lambda \neq 0                                           \\
            &\hspace{10mm} g_t \leftarrow g_t + \lambda  \theta_{t-1}                            \\
            &\hspace{5mm}m_t           \leftarrow   \beta_1 m_{t-1} + (1 - \beta_1) g_t          \\
            &\hspace{5mm}v_t           \leftarrow   \beta_2 v_{t-1} + (1-\beta_2) g^2_t          \\
            &\hspace{5mm}\widehat{m_t} \leftarrow   m_t/\big(1-\beta_1^t \big)                   \\
            &\hspace{5mm}\widehat{v_t} \leftarrow   v_t/\big(1-\beta_2^t \big)                   \\
            &\hspace{5mm}\textbf{if} \: amsgrad                                                  \\
            &\hspace{10mm}\widehat{v_t}^{max} \leftarrow \mathrm{max}(\widehat{v_t}^{max},
                \widehat{v_t})                                                                   \\
            &\hspace{10mm}\theta_t \leftarrow \theta_{t-1} - \gamma \widehat{m_t}/
                \big(\sqrt{\widehat{v_t}^{max}} + \epsilon \big)                                 \\
            &\hspace{5mm}\textbf{else}                                                           \\
            &\hspace{10mm}\theta_t \leftarrow \theta_{t-1} - \gamma \widehat{m_t}/
                \big(\sqrt{\widehat{v_t}} + \epsilon \big)                                       \\
            &\rule{110mm}{0.4pt}                                                          \\[-1.ex]
            &\bf{return} \:  \theta_t                                                     \\[-1.ex]
            &\rule{110mm}{0.4pt}                                                          \\[-1.ex]
       \end{aligned}

    For further details regarding the algorithm we refer to `Adam: A Method for Stochastic Optimization`_.

    Args:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        amsgrad (bool, optional): whether to use the AMSGrad variant of this
            algorithm from the paper `On the Convergence of Adam and Beyond`_
            (default: False)
        foreach (bool, optional): whether foreach implementation of optimizer
            is used (default: None)
        maximize (bool, optional): maximize the params based on the objective, instead of
            minimizing (default: False)
        capturable (bool, optional): whether this instance is safe to capture in a CUDA graph.
            Passing True can impair ungraphed performance, so if you don't intend to
            graph capture this instance, leave it False (default: False)
        fused (bool, optional): whether fused implementation of optimizer is used.
            Currently, `torch.float64`, `torch.float32`, `torch.float16`, and `torch.bfloat16`
            are supported. (default: False)

    .. _Adam\: A Method for Stochastic Optimization:
        https://arxiv.org/abs/1412.6980
    .. _On the Convergence of Adam and Beyond:
        https://openreview.net/forum?id=ryQu7f-RZ
    MbP?g?g+?:0yE>r   FN)foreachmaximize
capturabledifferentiablefusedc                   s   d|kst d|d|ks,t d|d|d   krDdk sXn t d|d d|d   krpdk sn t d|d d|kst d	|t||||||||	|
|d

}tt| || |r|
rtdd| _tdd | j	D stdd S )N        zInvalid learning rate: {}zInvalid epsilon value: {}r   g      ?z%Invalid beta parameter at index 0: {}r	   z%Invalid beta parameter at index 1: {}zInvalid weight_decay value: {})
lrbetasepsweight_decayamsgradr4   r3   r5   r6   r7   z"`fused` cannot be `differentiable`Tc                 s   s,   | ]$}|d  D ]}|j o t|V  qqdS )paramsN)is_cudar,   Zis_floating_point)r&   Zpgpr   r   r   	<genexpr>   s    
 z Adam.__init__.<locals>.<genexpr>z<FusedAdam requires all the params to be CUDA, floating point)

ValueErrorformatdictsuperr   r   RuntimeErrorZ_step_supports_amp_scalingallparam_groups)r   r>   r9   r:   r;   r<   r=   r3   r4   r5   r6   r7   defaults	__class__r   r   r   w   s8        zAdam.__init__c                    s   t  | | jD ]L}|dd |dd |dd  |dd |dd |dd qt| j }t|dkot	|d d	 }|s|D ]}t
t|d	 |d	< qd S )
Nr=   Fr4   r3   r5   r6   r7   r   step)rE   __setstate__rH   
setdefaultliststater)   r*   r,   Z	is_tensorr   float)r   rP   groupZstate_valuesZstep_is_tensorsrJ   r   r   rM      s    
zAdam.__setstate__r#   c                C   s*  |    d}|dk	r.t  | }W 5 Q R X | jD ]}g }g }g }g }g }	g }
|d \}}d}d}|d r|dk	r| }|j}t|}t| ||d}|d D ],}|jdk	r|	| |jj
rtd|	|j | j| }t|dkrp| jd s| jd rtjd	tj|jd
ntd|d< tj|tjd|d< tj|tjd|d< |d rptj|tjd|d< |	|d  |	|d  |d r|		|d  |d r|d jrtd|
	|d  qt|||||	|
|d |||d |d |d |d |d |d |d |d ||d q4|S )aN  Performs a single optimization step.

        Args:
            closure (Callable, optional): A closure that reevaluates the model
                and returns the loss.
            grad_scaler (:class:`torch.cuda.amp.GradScaler`, optional): A GradScaler which is
                supplied from ``grad_scaler.step(optimizer)``.
        Nr:   r7   )r%   r$   r   r>   zJAdam does not support sparse gradients, please consider SparseAdam insteadr   r5   r	   )dtyper   r8   rL   )Zmemory_formatexp_avg
exp_avg_sqr=   Zmax_exp_avg_sqr6   zB`requires_grad` is not supported for `step` in differentiable moder9   r<   r;   r4   r3   )r=   beta1beta2r9   r<   r;   r4   r3   r5   r6   r7   
grad_scale	found_inf)Z _cuda_graph_capture_health_checkr,   Zenable_gradrH   Z_get_scale_asyncr   r   r/   gradappendZ	is_sparserF   rP   r*   rI   zerosrQ   r   Z
zeros_likeZpreserve_formatZrequires_gradr   )r   closurer$   ZlossrR   Zparams_with_gradgradsexp_avgsexp_avg_sqsmax_exp_avg_sqsstate_stepsrX   rY   rZ   r[   r   r@   rP   r   r   r   rL      s    






z	Adam.step)r0   r1   r2   r   F)N)r   r    r!   __doc__r   boolr   rM   r   rL   __classcell__r   r   rJ   r   r   2   s(   D        "F)r>   r`   ra   rb   rc   rd   r3   r5   r6   r7   rZ   r[   r=   rX   rY   r9   r<   r;   r4   c                C   s   t dd |D std|dkr&d}|r<tj r<td|rPtj sPt}n|	rdtj sdt}nt}|| |||||||||||||||
|d dS )zmFunctional API that performs Adam algorithm computation.
    See :class:`~torch.optim.Adam` for details.
    c                 s   s   | ]}t |tjV  qd S r   )
isinstancer,   r   )r&   tr   r   r   rA     s     zadam.<locals>.<genexpr>zPAPI has changed, `state_steps` argument must contain a list of singleton tensorsNFz6torch.jit.script not supported with foreach optimizers)r=   rX   rY   r9   r<   r;   r4   r5   r6   rZ   r[   )rG   rF   r,   ZjitZis_scripting_multi_tensor_adam_fused_adam_single_tensor_adam)r>   r`   ra   rb   rc   rd   r3   r5   r6   r7   rZ   r[   r=   rX   rY   r9   r<   r;   r4   funcr   r   r   r     s:    )r>   r`   ra   rb   rc   rd   rZ   r[   r=   rX   rY   r9   r<   r;   r4   r5   r6   c       	         C   sJ  |d kr|d kst t| D ]&\}}|s2|| n||  }|| }|| }|| }|rl|jrd|jslt d|d7 }|dkr|j||d}t|rt|}t|}t|}t|}||	j|d|	 d ||
j	||
 d|
 d |s|r|}dt|	| }dt|
| }|| }| }| }|r|rR||  }n|| }|| t|| ||  ||  || }n| ||  || }||| q| }d|	|  }d|
|  }|| }t|}|r tj|| ||| d ||  | |}n| | |}|j||| d qd S )N@If capturable=True, params and state_steps must be CUDA tensors.r	   r   alpha)value)out)r+   	enumerater?   addr,   
is_complexview_as_realZmul_Zadd_Zaddcmul_ZconjpownegsqrtcloneZcopy_maximumZaddcdiv_itemmath)r>   r`   ra   rb   rc   rd   rZ   r[   r=   rX   rY   r9   r<   r;   r4   r5   r6   iparamr\   rV   rW   Zstep_trL   bias_correction1bias_correction2	step_sizeZstep_size_negbias_correction2_sqrtZmax_exp_avg_sqs_idenomr   r   r   rl   ?  sV    





 
rl   c       	            s  t | dkrd S |r4tdd t| |D s4td|d krD|d ksHt|rZtt|}|rftddd |D }dd |D }d	d |D }d
d | D }t|d |dkrtj|| |d t|  tj||d  d t| t	|||d  |r& fdd|D }fdd|D }t
|d t
|d t| t| t|}t| t| t|}|rt|| t|}t|t|| t||}t| t||}n@t|}t|t|| t||}t| t||}t||| n fdd|D }fdd|D }fdd|D }dd |D }|rt|| t|}t|| t||}n"t|}t|| t||}t|||| d S )Nr   c                 s   s   | ]\}}|j o|j V  qd S r   )r?   )r&   r@   rL   r   r   r   rA     s     z%_multi_tensor_adam.<locals>.<genexpr>rn   z#_foreach ops don't support autogradc                 S   s$   g | ]}t |rt |n|qS r   r,   ru   rv   r&   xr   r   r   r(     s     z&_multi_tensor_adam.<locals>.<listcomp>c                 S   s$   g | ]}t |rt |n|qS r   r   r   r   r   r   r(     s     c                 S   s$   g | ]}t |rt |n|qS r   r   r   r   r   r   r(     s     c                 S   s$   g | ]}t |rt |n|qS r   r   r   r   r   r   r(     s     r	   ro   c                    s   g | ]}t  |qS r   r,   rw   r&   rL   rX   r   r   r(     s     c                    s   g | ]}t  |qS r   r   r   rY   r   r   r(     s     c                    s   g | ]}d  |    qS rT   r|   r   r   r   r   r(     s     c                    s   g | ]}d  |    qS rT   r   r   r   r   r   r(     s     c                    s   g | ]} | d  qS )r   r&   Zbc)r9   r   r   r(     s     c                 S   s   g | ]}t |qS r   )r}   ry   r   r   r   r   r(     s     )r*   rG   zipr+   r,   Z_foreach_negtuple_foreach_add_Z_foreach_mul_Z_foreach_addcmul__foreach_sub_Z_foreach_neg_Z_foreach_divZ_foreach_reciprocal_Z_foreach_sqrtZ_foreach_maximum_Z_foreach_div_Z_foreach_mulZ_foreach_addZ_foreach_addcdiv_)r>   r`   ra   rb   rc   rd   rZ   r[   r=   rX   rY   r9   r<   r;   r4   r5   r6   Zparams_r   r   r   r   Zmax_exp_avg_sq_sqrtZeps_over_step_sizer   Zexp_avg_sq_sqrtr   )rX   rY   r9   r   rj     st    










rj   )r>   r`   ra   rb   rc   rd   r   c                 C   s   t dd }tt| |D ]\}\}}	t|j|jf}
||
 d | ||
 d ||  ||
 d ||  ||
 d ||  |r||
 d ||  ||
 d |	 q|S )	Nc                   S   s   dd t dD S )Nc                 S   s   g | ]}g qS r   r   )r&   _r   r   r   r(     s     zG_group_params_by_device_and_dtype.<locals>.<lambda>.<locals>.<listcomp>   )ranger   r   r   r   <lambda>      z3_group_params_by_device_and_dtype.<locals>.<lambda>r   r	               )r   rs   r   r   r   rU   r]   )r>   r`   ra   rb   rc   rd   Zper_device_and_dtype_tensorsr~   r@   rL   keyr   r   r   !_group_params_by_device_and_dtype  s    	r   )r>   r`   ra   rb   rc   rd   rZ   r[   r=   rX   rY   r9   r<   r;   r4   r5   r6   r   c       	         C   s   t | |||||}|D ]\}}|||f \}}}}}}|d k	r\|d k	r\||}||}nd }d }t|d tj|||||||||	|
|||||d |d k	rt||gt|  qd S )Nr	   )	r=   r9   rX   rY   r<   r;   r4   rZ   r[   )r   r   r,   r   Z_fused_adam_r   r*   )r>   r`   ra   rb   rc   rd   rZ   r[   r=   rX   rY   r9   r<   r;   r4   r5   r6   Zgrouped_tensorsr   rU   Zdevice_paramsZdevice_gradsZdevice_exp_avgsZdevice_exp_avg_sqsZdevice_max_exp_avg_sqsZdevice_state_stepsZdevice_grad_scaleZdevice_found_infr   r   r   rk   '  sF    

rk   )NFFFNN) collectionsr   r}   typingr   r   r   r   r   r,   r   r%   r
   r   __all__r   ZcudaampZ
GradScalerr   r/   r   rf   rQ   r   rl   rj   r-   r   rU   r   rk   r   r   r   r   <module>   s    X      >`r