"""Functions and classes related to optimization (weight updates)."""

import re

import tensorflow as tf


class WarmUp(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Applies a warmup schedule on a given learning rate decay schedule."""

    def __init__(self, initial_learning_rate, decay_schedule_fn, warmup_steps, power=1.0, name=None):
        super().__init__()
        self.initial_learning_rate = initial_learning_rate
        self.warmup_steps = warmup_steps
        self.power = power
        self.decay_schedule_fn = decay_schedule_fn
        self.name = name

    def __call__(self, step):
        with tf.name_scope(self.name or "WarmUp") as name:
            # Implements polynomial warmup: while global_step < warmup_steps, the learning
            # rate is `initial_learning_rate * (global_step / warmup_steps) ** power`;
            # afterwards it follows `decay_schedule_fn`.
            global_step_float = tf.cast(step, tf.float32)
            warmup_steps_float = tf.cast(self.warmup_steps, tf.float32)
            warmup_percent_done = global_step_float / warmup_steps_float
            warmup_learning_rate = self.initial_learning_rate * tf.math.pow(warmup_percent_done, self.power)
            return tf.cond(
                global_step_float < warmup_steps_float,
                lambda: warmup_learning_rate,
                lambda: self.decay_schedule_fn(step),
                name=name,
            )

    def get_config(self):
        return {
            "initial_learning_rate": self.initial_learning_rate,
            "decay_schedule_fn": self.decay_schedule_fn,
            "warmup_steps": self.warmup_steps,
            "power": self.power,
            "name": self.name,
        }


def create_optimizer(init_lr, num_train_steps, num_warmup_steps, end_lr=0.0, optimizer_type="adamw"):
    """Creates an optimizer with learning rate schedule."""
    # Linear decay of the learning rate from `init_lr` to `end_lr`, optionally
    # preceded by a linear warmup over `num_warmup_steps`.
    lr_schedule = tf.keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=init_lr, decay_steps=num_train_steps, end_learning_rate=end_lr
    )
    if num_warmup_steps:
        lr_schedule = WarmUp(
            initial_learning_rate=init_lr, decay_schedule_fn=lr_schedule, warmup_steps=num_warmup_steps
        )

    optimizer = AdamWeightDecay(
        learning_rate=lr_schedule,
        weight_decay_rate=0.01,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-6,
        exclude_from_weight_decay=["layer_norm", "bias"],
    )

    return optimizer


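# Illustrative usage sketch (hypothetical helper, not part of this module's API):
# one way `create_optimizer` might be wired into a Keras training setup. `model` is
# assumed to be an already-built `tf.keras.Model`; the step counts and loss are
# placeholders chosen for the example.
def _example_create_optimizer_usage(model, num_train_steps=10000):
    # Warm up over the first 10% of steps, then decay linearly to 0.
    optimizer = create_optimizer(
        init_lr=3e-5,
        num_train_steps=num_train_steps,
        num_warmup_steps=int(0.1 * num_train_steps),
    )
    model.compile(optimizer=optimizer, loss="sparse_categorical_crossentropy")
    return optimizer

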
class AdamWeightDecay(tf.keras.optimizers.Adam):
    """Adam enables L2 weight decay and clip_by_global_norm on gradients.
    Just adding the square of the weights to the loss function is *not* the
    correct way of using L2 regularization/weight decay with Adam, since that will
    interact with the m and v parameters in strange ways.
    Instead we want to decay the weights in a manner that doesn't interact with
    the m/v parameters. This is equivalent to adding the square of the weights to
    the loss with plain (non-momentum) SGD.
    """

    def __init__(
        self,
        learning_rate=0.001,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-7,
        amsgrad=False,
        weight_decay_rate=0.0,
        include_in_weight_decay=None,
        exclude_from_weight_decay=None,
        name="AdamWeightDecay",
        **kwargs
    ):
        super().__init__(learning_rate, beta_1, beta_2, epsilon, amsgrad, name, **kwargs)
        self.weight_decay_rate = weight_decay_rate
        self._include_in_weight_decay = include_in_weight_decay
        self._exclude_from_weight_decay = exclude_from_weight_decay

    @classmethod
    def from_config(cls, config):
        """Creates an optimizer from its config with WarmUp custom object."""
        custom_objects = {"WarmUp": WarmUp}
        return super(AdamWeightDecay, cls).from_config(config, custom_objects=custom_objects)

    def _prepare_local(self, var_device, var_dtype, apply_state):
        super(AdamWeightDecay, self)._prepare_local(var_device, var_dtype, apply_state)
        apply_state[(var_device, var_dtype)]["weight_decay_rate"] = tf.constant(
            self.weight_decay_rate, name="adam_weight_decay_rate"
        )

    def _decay_weights_op(self, var, learning_rate, apply_state):
        do_decay = self._do_use_weight_decay(var.name)
        if do_decay:
            # Decoupled weight decay: shrink the variable directly by
            # `learning_rate * weight_decay_rate * var`, independently of the m/v moments.
            return var.assign_sub(
                learning_rate * var * apply_state[(var.device, var.dtype.base_dtype)]["weight_decay_rate"],
                use_locking=self._use_locking,
            )
        return tf.no_op()

    def apply_gradients(self, grads_and_vars, name=None):
        grads, tvars = list(zip(*grads_and_vars))
        return super(AdamWeightDecay, self).apply_gradients(zip(grads, tvars), name=name)

    def _get_lr(self, var_device, var_dtype, apply_state):
        """Retrieves the learning rate with the given state."""
        if apply_state is None:
            return self._decayed_lr_t[var_dtype], {}

        apply_state = apply_state or {}
        coefficients = apply_state.get((var_device, var_dtype))
        if coefficients is None:
            coefficients = self._fallback_apply_state(var_device, var_dtype)
            apply_state[(var_device, var_dtype)] = coefficients

        return coefficients["lr_t"], dict(apply_state=apply_state)

    def _resource_apply_dense(self, grad, var, apply_state=None):
        lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
        decay = self._decay_weights_op(var, lr_t, apply_state)
        with tf.control_dependencies([decay]):
            return super(AdamWeightDecay, self)._resource_apply_dense(grad, var, **kwargs)

    def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
        lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
        decay = self._decay_weights_op(var, lr_t, apply_state)
        with tf.control_dependencies([decay]):
            return super(AdamWeightDecay, self)._resource_apply_sparse(grad, var, indices, **kwargs)

    def get_config(self):
        config = super().get_config()
        config.update({"weight_decay_rate": self.weight_decay_rate})
        return config

    def _do_use_weight_decay(self, param_name):
        """Whether to use L2 weight decay for `param_name`."""
        if self.weight_decay_rate == 0:
            return False

        if self._include_in_weight_decay:
            for r in self._include_in_weight_decay:
                if re.search(r, param_name) is not None:
                    return True

        if self._exclude_from_weight_decay:
            for r in self._exclude_from_weight_decay:
                if re.search(r, param_name) is not None:
                    return False
        return True


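# Illustrative sketch (hypothetical helper, not part of this module's API): building
# `AdamWeightDecay` directly with a decaying schedule instead of going through
# `create_optimizer`. Variables whose names match one of the `exclude_from_weight_decay`
# regexes receive the plain Adam update only; every other variable is additionally
# shrunk by `learning_rate * weight_decay_rate` of its value on each step
# (see `_decay_weights_op`).
def _example_adam_weight_decay(num_train_steps=10000):
    lr_schedule = tf.keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=1e-4, decay_steps=num_train_steps, end_learning_rate=0.0
    )
    return AdamWeightDecay(
        learning_rate=lr_schedule,
        weight_decay_rate=0.01,
        epsilon=1e-6,
        exclude_from_weight_decay=["layer_norm", "bias"],
    )

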
d Z	dS )GradientAccumulatoraO  Gradient accumulation utility.
  When used with a distribution strategy, the accumulator should be called in a
  replica context. Gradients will be accumulated locally on each replica and
  without synchronization. Users should then call ``.gradients``, scale the
  gradients if required, and pass the result to ``apply_gradients``.
  c                 C   s   g | _ d| _dS )zInitializes the accumulator.N)
_gradients_accum_stepsr   r   r   r   r      s    zGradientAccumulator.__init__c                 C   s<   | j dkr2tjtjdtjddtjjtjjd| _ | j 	 S )zNumber of accumulated steps.Nr   )rB   FZ	trainableZsynchronizationZaggregation)
r\   r   Variabler<   Zint64VariableSynchronizationON_READVariableAggregationONLY_FIRST_REPLICAvaluer   r   r   r   r      s    
zGradientAccumulator.stepc                 C   s"   | j stdtdd | j D S )z1The accumulated gradients on the current replica.zBThe accumulator should be called first to initialize the gradientsc                 s   s   | ]}|  V  qd S r   )rc   .0gradientr   r   r   	<genexpr>   s     z0GradientAccumulator.gradients.<locals>.<genexpr>)r[   
ValueErrorrF   r   r   r   r   	gradients   s    zGradientAccumulator.gradientsc                 C   s~   | j s"| j}| j dd |D  t|t| j krNtdt| j t|f t| j |D ]\}}|| qZ| jd dS )z4Accumulates :obj:`gradients` on the current replica.c                 S   s,   g | ]$}t jt |d t jjt jjdqS )Fr]   )r   r^   
zeros_liker_   r`   ra   rb   rd   r   r   r   
<listcomp>   s   z0GradientAccumulator.__call__.<locals>.<listcomp>z!Expected %s gradients, but got %d   N)r[   r   extendlenrh   rG   Z
assign_addr\   )r   ri   _Zaccum_gradientrf   r   r   r   r      s    zGradientAccumulator.__call__c                 C   s6   | j s
dS | jd | j D ]}|t| qdS )z8Resets the accumulated gradients on the current replica.Nr   )r[   r\   Zassignr   rj   )r   rf   r   r   r   reset   s
    
zGradientAccumulator.resetN)
r   r   r    r!   r   propertyr   ri   r   rp   r   r   r   r   rZ      s   

rZ   )r#   r$   )r!   rU   Z
tensorflowr   r-   r.   r/   ZLearningRateScheduler   r1   ZAdamr0   objectrZ   r   r   r   r   <module>   s   &
e
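# Illustrative usage sketch (hypothetical helper, not part of this module's API): a
# minimal single-replica gradient-accumulation loop. `model`, `loss_fn`, `batches`,
# and `optimizer` are assumed to exist; under a distribution strategy the accumulator
# would instead be called inside the replica context, as the class docstring notes.
def _example_gradient_accumulation(model, loss_fn, batches, optimizer, accum_steps=4):
    accumulator = GradientAccumulator()
    for step, (features, labels) in enumerate(batches, start=1):
        with tf.GradientTape() as tape:
            loss = loss_fn(labels, model(features, training=True))
        # Accumulate this micro-batch's gradients locally.
        accumulator(tape.gradient(loss, model.trainable_variables))
        if step % accum_steps == 0:
            # Average the accumulated gradients, apply them, then reset the accumulator.
            grads = [g / tf.cast(accum_steps, g.dtype) for g in accumulator.gradients]
            optimizer.apply_gradients(zip(grads, model.trainable_variables))
            accumulator.reset()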