from collections import defaultdict, abc as container_abcs

import torch
from copy import deepcopy
from itertools import chain
import warnings
import functools

__all__ = ['Optimizer']


class _RequiredParameter(object):
    """Singleton class representing a required parameter for an Optimizer."""
    def __repr__(self):
        return "<required parameter>"

required = _RequiredParameter()


def _use_grad_for_differentiable(func):
    def _use_grad(self, *args, **kwargs):
        prev_grad = torch.is_grad_enabled()
        try:
            torch.set_grad_enabled(self.defaults['differentiable'])
            ret = func(self, *args, **kwargs)
        finally:
            torch.set_grad_enabled(prev_grad)
        return ret
    return _use_grad


class Optimizer(object):
    r"""Base class for all optimizers.

    .. warning::
        Parameters need to be specified as collections that have a deterministic
        ordering that is consistent between runs. Examples of objects that don't
        satisfy those properties are sets and iterators over values of dictionaries.

    Args:
        params (iterable): an iterable of :class:`torch.Tensor` s or
            :class:`dict` s. Specifies what Tensors should be optimized.
        defaults: (dict): a dict containing default values of optimization
            options (used when a parameter group doesn't specify them).
    """

    def __init__(self, params, defaults):
        torch._C._log_api_usage_once("python.optimizer")
        self.defaults = defaults
        self._hook_for_profile()

        if isinstance(params, torch.Tensor):
            raise TypeError("params argument given to the optimizer should be "
                            "an iterable of Tensors or dicts, but got " +
                            torch.typename(params))

        self.state = defaultdict(dict)
        self.param_groups = []

        param_groups = list(params)
        if len(param_groups) == 0:
            raise ValueError("optimizer got an empty parameter list")
        if not isinstance(param_groups[0], dict):
            param_groups = [{'params': param_groups}]

        for param_group in param_groups:
            self.add_param_group(param_group)

        # Flag consumed by _cuda_graph_capture_health_check (see that method).
        self._warned_capturable_if_run_uncaptured = True

    def __getstate__(self):
        return {
            'defaults': self.defaults,
            'state': self.state,
            'param_groups': self.param_groups,
        }

    def __setstate__(self, state):
        self.__dict__.update(state)
        self._hook_for_profile()  # To support multiprocessing pickle/unpickle.
        self.defaults.setdefault('differentiable', False)

    def __repr__(self):
        format_string = self.__class__.__name__ + ' ('
        for i, group in enumerate(self.param_groups):
            format_string += '\n'
            format_string += 'Parameter Group {0}\n'.format(i)
            for key in sorted(group.keys()):
                if key != 'params':
                    format_string += '    {0}: {1}\n'.format(key, group[key])
        format_string += ')'
        return format_string

    def _cuda_graph_capture_health_check(self):
        if torch.has_cuda and torch.cuda.is_available():
            capturing = torch.cuda.is_current_stream_capturing()

            if capturing and not self.defaults['capturable']:
                raise RuntimeError("Attempting CUDA graph capture of step() for an instance of " +
                                   self.__class__.__name__ +
                                   " but this instance was constructed with capturable=False.")

            if (
                (not getattr(self, "_warned_capturable_if_run_uncaptured", False))
                and self.defaults["capturable"]
                and (not capturing)
            ):
                print("Warning: This instance was constructed with capturable=True, but step() "
                      "is running without CUDA graph capture. If you never intend to graph-capture this "
                      "instance, capturable=True can impair performance, and you should set capturable=False.")
                self._warned_capturable_if_run_uncaptured = True

    def _optimizer_step_code(self):
        """Entry point for `torch.profile.profiler`.

        When python tracing is enabled the profiler will hook into this
        function at the CPython level to inspect the optimizer's parameters and
        param groups. It is called after `step()` since many optimizers
        lazily initialize state.

        This is a workaround due to lack of a proper step hook on the optimizer,
        and will be removed if it exists.
        """
        pass

    def _hook_for_profile(self):
        self._zero_grad_profile_name = "Optimizer.zero_grad#{}.zero_grad".format(self.__class__.__name__)

        def profile_hook_step(func):

            @functools.wraps(func)
            def wrapper(*args, **kwargs):
                obj, *_ = args
                profile_name = "Optimizer.step#{}.step".format(obj.__class__.__name__)
                with torch.autograd.profiler.record_function(profile_name):
                    out = func(*args, **kwargs)
                    obj._optimizer_step_code()
                    return out

            return wrapper

        hooked = getattr(self.__class__.step, "hooked", None)
        if not hooked:
            self.__class__.step = profile_hook_step(self.__class__.step)
            self.__class__.step.hooked = True

    def state_dict(self):
        r"""Returns the state of the optimizer as a :class:`dict`.

        It contains two entries:

        * state - a dict holding current optimization state. Its content
            differs between optimizer classes.
        * param_groups - a list containing all parameter groups where each
            parameter group is a dict
        """
        # Save order indices instead of Tensors
        param_mappings = {}
        start_index = 0

        def pack_group(group):
            nonlocal start_index
            packed = {k: v for k, v in group.items() if k != 'params'}
            param_mappings.update({id(p): i for i, p in enumerate(group['params'], start_index)
                                   if id(p) not in param_mappings})
            packed['params'] = [param_mappings[id(p)] for p in group['params']]
            start_index += len(packed['params'])
            return packed

        param_groups = [pack_group(g) for g in self.param_groups]
        # Remap state to use order indices as keys
        packed_state = {(param_mappings[id(k)] if isinstance(k, torch.Tensor) else k): v
                        for k, v in self.state.items()}
        return {
            'state': packed_state,
            'param_groups': param_groups,
        }

    def load_state_dict(self, state_dict):
        r"""Loads the optimizer state.

        Args:
            state_dict (dict): optimizer state. Should be an object returned
                from a call to :meth:`state_dict`.
        """
        # deepcopy, to be consistent with module API
        state_dict = deepcopy(state_dict)
        # Validate the state_dict
        groups = self.param_groups
        saved_groups = state_dict['param_groups']

        if len(groups) != len(saved_groups):
            raise ValueError("loaded state dict has a different number of parameter groups")
        param_lens = (len(g['params']) for g in groups)
        saved_lens = (len(g['params']) for g in saved_groups)
        if any(p_len != s_len for p_len, s_len in zip(param_lens, saved_lens)):
            raise ValueError("loaded state dict contains a parameter group "
                             "that doesn't match the size of optimizer's group")

        # Update the state
        id_map = {old_id: p for old_id, p in
                  zip(chain.from_iterable((g['params'] for g in saved_groups)),
                      chain.from_iterable((g['params'] for g in groups)))}

        def cast(param, value, key=None):
            r"""Make a deep copy of value, casting all tensors to device of param."""
            if isinstance(value, torch.Tensor):
                # Floating-point state is moved to the dtype/device of the param;
                # integer state such as 'step' counters is left untouched.
                if key != "step":
                    if param.is_floating_point():
                        value = value.to(param.dtype)
                    value = value.to(param.device)
                return value
            elif isinstance(value, dict):
                return {k: cast(param, v, key=k) for k, v in value.items()}
            elif isinstance(value, container_abcs.Iterable):
                return type(value)(cast(param, v) for v in value)
            else:
                return value

        # Copy state assigned to params (and cast it to appropriate types).
        # State that is not assigned to params is copied as is (needed for
        # backward compatibility).
        state = defaultdict(dict)
        for k, v in state_dict['state'].items():
            if k in id_map:
                param = id_map[k]
                state[param] = cast(param, v)
            else:
                state[k] = v

        # Update parameter groups, setting their 'params' value
        def update_group(group, new_group):
            new_group['params'] = group['params']
            return new_group
        param_groups = [update_group(g, ng) for g, ng in zip(groups, saved_groups)]
        self.__setstate__({'state': state, 'param_groups': param_groups})

    def zero_grad(self, set_to_none: bool = False):
        r"""Sets the gradients of all optimized :class:`torch.Tensor` s to zero.

        Args:
            set_to_none (bool): instead of setting to zero, set the grads to None.
                This will in general have lower memory footprint, and can modestly improve performance.
                However, it changes certain behaviors. For example:
                1. When the user tries to access a gradient and perform manual ops on it,
                a None attribute or a Tensor full of 0s will behave differently.
                2. If the user requests ``zero_grad(set_to_none=True)`` followed by a backward pass, ``.grad``\ s
                are guaranteed to be None for params that did not receive a gradient.
                3. ``torch.optim`` optimizers have a different behavior if the gradient is 0 or None
                (in one case it does the step with a gradient of 0 and in the other it skips
                the step altogether).
        """
        foreach = self.defaults.get('foreach', False)

        if not hasattr(self, "_zero_grad_profile_name"):
            self._hook_for_profile()
        if foreach:
            per_device_and_dtype_grads = defaultdict(lambda: defaultdict(list))
        with torch.autograd.profiler.record_function(self._zero_grad_profile_name):
            for group in self.param_groups:
                for p in group['params']:
                    if p.grad is not None:
                        if set_to_none:
                            p.grad = None
                        else:
                            if p.grad.grad_fn is not None:
                                p.grad.detach_()
                            else:
                                p.grad.requires_grad_(False)
                            if not foreach or p.grad.is_sparse:
                                p.grad.zero_()
                            else:
                                per_device_and_dtype_grads[p.grad.device][p.grad.dtype].append(p.grad)
            if foreach:
                for _, per_dtype_grads in per_device_and_dtype_grads.items():
                    for grads in per_dtype_grads.values():
                        torch._foreach_zero_(grads)

    def step(self, closure):
        r"""Performs a single optimization step (parameter update).

        Args:
            closure (Callable): A closure that reevaluates the model and
                returns the loss. Optional for most optimizers.

        .. note::
            Unless otherwise specified, this function should not modify the
            ``.grad`` field of the parameters.
        """
        raise NotImplementedError

    def add_param_group(self, param_group):
        r"""Add a param group to the :class:`Optimizer` s `param_groups`.

        This can be useful when fine tuning a pre-trained network as frozen layers can be made
        trainable and added to the :class:`Optimizer` as training progresses.

        Args:
            param_group (dict): Specifies what Tensors should be optimized along with group
                specific optimization options.
        """
        assert isinstance(param_group, dict), "param group must be a dict"

        params = param_group['params']
        if isinstance(params, torch.Tensor):
            param_group['params'] = [params]
        elif isinstance(params, set):
            raise TypeError('optimizer parameters need to be organized in ordered collections, but '
                            'the ordering of tensors in sets will change between runs. Please use a list instead.')
        else:
            param_group['params'] = list(params)

        for param in param_group['params']:
            if not isinstance(param, torch.Tensor):
                raise TypeError("optimizer can only optimize Tensors, "
                                "but one of the params is " + torch.typename(param))
            if not self.defaults.get('differentiable', None) and not (param.is_leaf or param.retains_grad):
                raise ValueError("can't optimize a non-leaf Tensor")

        for name, default in self.defaults.items():
            if default is required and name not in param_group:
                raise ValueError("parameter group didn't specify a value of required optimization parameter " +
                                 name)
            else:
                param_group.setdefault(name, default)

        params = param_group['params']
        if len(params) != len(set(params)):
            warnings.warn("optimizer contains a parameter group with duplicate parameters; "
                          "in future, this will cause an error; "
                          "see github.com/pytorch/pytorch/issues/40967 for more information", stacklevel=3)

        param_set = set()
        for group in self.param_groups:
            param_set.update(set(group['params']))

        if not param_set.isdisjoint(set(param_group['params'])):
            raise ValueError("some parameters appear in more than one parameter group")

        self.param_groups.append(param_group)