"""PyTorch optimization for BERT model."""

import logging
import math

import torch
from torch.optim import Optimizer
from torch.optim.lr_scheduler import LambdaLR

logger = logging.getLogger(__name__)


def get_constant_schedule(optimizer, last_epoch=-1):
    """Create a schedule with a constant learning rate."""
    return LambdaLR(optimizer, lambda _: 1, last_epoch=last_epoch)


def get_constant_schedule_with_warmup(optimizer, num_warmup_steps, last_epoch=-1):
    """Create a schedule with a constant learning rate preceded by a warmup
    period during which the learning rate increases linearly between 0 and 1.
    """

    def lr_lambda(current_step):
        if current_step < num_warmup_steps:
            return float(current_step) / float(max(1.0, num_warmup_steps))
        return 1.0

    return LambdaLR(optimizer, lr_lambda, last_epoch=last_epoch)


def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1):
    """Create a schedule with a learning rate that decreases linearly after
    linearly increasing during a warmup period.
    """

    def lr_lambda(current_step):
        if current_step < num_warmup_steps:
            return float(current_step) / float(max(1, num_warmup_steps))
        return max(
            0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps))
        )

    return LambdaLR(optimizer, lr_lambda, last_epoch)


def get_cosine_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, num_cycles=0.5, last_epoch=-1):
    """Create a schedule with a learning rate that decreases following the
    values of the cosine function between 0 and `pi * cycles` after a warmup
    period during which it increases linearly between 0 and 1.
    """

    def lr_lambda(current_step):
        if current_step < num_warmup_steps:
            return float(current_step) / float(max(1, num_warmup_steps))
        progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
        return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)))

    return LambdaLR(optimizer, lr_lambda, last_epoch)


def get_cosine_with_hard_restarts_schedule_with_warmup(
    optimizer, num_warmup_steps, num_training_steps, num_cycles=1.0, last_epoch=-1
):
    """Create a schedule with a learning rate that decreases following the
    values of the cosine function with several hard restarts, after a warmup
    period during which it increases linearly between 0 and 1.
    """

    def lr_lambda(current_step):
        if current_step < num_warmup_steps:
            return float(current_step) / float(max(1, num_warmup_steps))
        progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
        if progress >= 1.0:
            return 0.0
        return max(0.0, 0.5 * (1.0 + math.cos(math.pi * ((float(num_cycles) * progress) % 1.0))))

    return LambdaLR(optimizer, lr_lambda, last_epoch)
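
# Illustrative sketch (not part of the original module): every helper above
# returns a plain ``torch.optim.lr_scheduler.LambdaLR``, so it is advanced with
# ``scheduler.step()`` once per optimizer update. ``_example_warmup_lrs`` and
# its step counts are hypothetical names added only to show the resulting
# learning-rate curve; they assume a reasonably recent torch with
# ``get_last_lr()``.
def _example_warmup_lrs():
    params = [torch.nn.Parameter(torch.zeros(1))]
    optimizer = torch.optim.SGD(params, lr=1.0)  # base lr of 1.0, so the lambda value equals the lr
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=4, num_training_steps=10)
    lrs = []
    for _ in range(10):
        lrs.append(scheduler.get_last_lr()[0])
        optimizer.step()
        scheduler.step()
    return lrs  # ramps up linearly during the first 4 steps, then decays linearly toward 0.0
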
dZ  ZS )AdamWa   Implements Adam algorithm with weight decay fix.

    Parameters:
        lr (float): learning rate. Default 1e-3.
        betas (tuple of 2 floats): Adams beta parameters (b1, b2). Default: (0.9, 0.999)
        eps (float): Adams epsilon. Default: 1e-6
        weight_decay (float): Weight decay. Default: 0.0
        correct_bias (bool): can be set to False to avoid correcting bias in Adam (e.g. like in Bert TF repository). Default True.
    MbP?g?g+?ư>r   Tc                    s   |dk rt d|d|d   kr.dk sBn t d|d d|d   krZdk snn t d|d d|kst d|t|||||d}t || d S )	Nr   z,Invalid learning rate: {} - should be >= 0.0r   r   z4Invalid beta parameter: {} - should be in [0.0, 1.0[r   z,Invalid epsilon value: {} - should be >= 0.0)lrbetasepsweight_decaycorrect_bias)
ValueErrorformatdictsuper__init__)selfparamsr.   r/   r0   r1   r2   defaults	__class__r   r	   r7   k   s    zAdamW.__init__Nc                 C   s  d}|dk	r| }| j D ]b}|d D ]R}|jdkr8q&|jj}|jrNtd| j| }t|dkrd|d< t|j|d< t|j|d< |d |d  }}|d \}	}
|d  d	7  < |	|	
d
|	 | |	|
d
|
 || | 
|d }|d }|d r>d
|	|d   }d
|
|d   }|t| | }|j| || |d dkr&|j
|d  |d  |j q&q|S )zPerforms a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        Nr9   zJAdam does not support sparse gradients, please consider SparseAdam insteadr   stepexp_avg
exp_avg_sqr/   r   r   r0   r.   r2   r1   r   )Zparam_groupsgraddataZ	is_sparseRuntimeErrorstatelentorchZ
zeros_likeZmul_Zadd_Zaddcmul_sqrtr    Zaddcdiv_)r8   closureZlossgrouppr@   rC   r>   r?   Zbeta1Zbeta2ZdenomZ	step_sizeZbias_correction1Zbias_correction2r   r   r	   r=   w   s<    



"z
AdamW.step)r+   r,   r-   r   T)N)__name__
__module____qualname____doc__r7   r=   __classcell__r   r   r;   r	   r*   `   s   
r*   )r   )r   )r   )r   r   )r   r   )rM   loggingr    rE   Ztorch.optimr   Ztorch.optim.lr_schedulerr   	getLoggerrJ   loggerr   r   r   r(   r)   r*   r   r   r   r	   <module>   s   




   

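
# Minimal usage sketch (illustrative, not part of the original module): pair
# AdamW with a warmup schedule and step both once per batch. The toy model,
# learning rate, weight decay, and step counts below are assumptions chosen
# only for this example.
if __name__ == "__main__":
    model = torch.nn.Linear(10, 2)  # hypothetical placeholder model
    optimizer = AdamW(model.parameters(), lr=1e-3, weight_decay=0.01)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=10, num_training_steps=100)

    for _ in range(100):
        loss = model(torch.randn(4, 10)).pow(2).mean()  # dummy loss on random inputs
        loss.backward()
        optimizer.step()    # AdamW update with decoupled weight decay
        scheduler.step()    # advance the LambdaLR schedule after each optimizer step
        optimizer.zero_grad()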