from typing import List, Union, Mapping, Dict, Any

import torch.optim as optim
from torch import Tensor
from torch.distributed._shard.sharded_tensor import ShardedTensor


class ShardedOptimizer(optim.Optimizer):
    def __init__(
        self,
        named_params: Mapping[str, Union[Tensor, ShardedTensor]],
        optimizer_class,
        *optimizer_args,
        **optimizer_kwargs
    ):
        """
        ShardedOptimizer collects all tensors and local shard tensors of
        ShardedTensor, then uses these tensors as ``params`` for the optimizer.

        Args:
            named_params (Dict[str, Union[Tensor, ShardedTensor]]): a Dict
                of parameters, where key is the parameter key, value is either
                Tensor or ShardedTensor parameter.
            optimizer_class (torch.optim.Optimizer): the Optimizer to use
                locally, i.e. torch.optim.SGD, torch.optim.Adagrad, etc.
            *optimizer_args: the arguments to initialize the optimizer.
            **optimizer_kwargs: the keyword arguments to initialize the optimizer.

        """
        # Collect plain Tensors directly; for ShardedTensors, collect the
        # local shard tensors owned by this rank.
        tensors: List[Tensor] = []
        for value in named_params.values():
            if isinstance(value, ShardedTensor):
                for local_shard in value.local_shards():
                    tensors.append(local_shard.tensor)
            else:
                tensors.append(value)

        self.named_params = named_params
        self._optim = optimizer_class(tensors, *optimizer_args, **optimizer_kwargs)
        self.param_groups = self._optim.param_groups
        self.state = self._optim.state

    def zero_grad(self, set_to_none: bool = False):
        r"""Sets the gradients of all optimized :class:`torch.Tensor` s to zero.

        Args:
            set_to_none (bool): instead of setting to zero, set the grads to None.
                This will in general have lower memory footprint, and can modestly improve performance.
                However, it changes certain behaviors. For example:
                1. When the user tries to access a gradient and perform manual ops on it,
                a None attribute or a Tensor full of 0s will behave differently.
                2. If the user requests ``zero_grad(set_to_none=True)`` followed by a backward pass, ``.grad``\ s
                are guaranteed to be None for params that did not receive a gradient.
                3. ``torch.optim`` optimizers have a different behavior if the gradient is 0 or None
                (in one case it does the step with a gradient of 0 and in the other it skips
                the step altogether).
        """
        self._optim.zero_grad(set_to_none)

    def step(self, closure=None):
        r"""Performs a single optimization step (parameter update).

        Args:
            closure (Callable): A closure that reevaluates the model and
                returns the loss. Optional for most optimizers.

        .. note::
            Unless otherwise specified, this function should not modify the
            ``.grad`` field of the parameters.
        """
        self._optim.step(closure)

    def state_dict(self) -> Dict[str, Any]:
        """
        Returned state and param_groups will contain parameter keys
        instead of parameter indices like torch.optim.Optimizer.
        This allows for advanced functionality like optimizer re-sharding to be implemented.
        """
        raise NotImplementedError("ShardedOptimizer state_dict not implemented yet!")

    def load_state_dict(self, state_dict: Mapping[str, Any]):
        r"""Loads the ShardedOptimizer state.

        Args:
            state_dict (dict): ShardedOptimizer state. Should be an object returned
                from a call to :meth:`state_dict`.
        """
        raise NotImplementedError("ShardedOptimizer load_state_dict not implemented yet!")

    def add_param_group(self, param_group: Any):
        r"""Add a new param group
        """
        raise NotImplementedError("ShardedOptimizer add_param_group not implemented yet!")
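

# ---------------------------------------------------------------------------
# Usage sketch (added for illustration; not part of the original module).
# It shows the intended flow: build a name -> parameter mapping, hand it to
# ShardedOptimizer together with a local optimizer class, then drive the usual
# zero_grad / backward / step loop. Only plain Tensors are used here so the
# sketch runs without a process group; with ShardedTensor values it is the
# collected local shards that get updated instead. The parameter names below
# are hypothetical stand-ins.
if __name__ == "__main__":
    import torch

    weight = torch.randn(4, 4, requires_grad=True)
    bias = torch.zeros(4, requires_grad=True)

    sharded_optim = ShardedOptimizer(
        {"linear.weight": weight, "linear.bias": bias},  # named_params mapping
        optim.SGD,  # optimizer_class applied to the collected tensors
        lr=0.1,
    )

    loss = (weight.sum() + bias.sum()) ** 2
    sharded_optim.zero_grad()
    loss.backward()
    sharded_optim.step()  # updates weight and bias in place via the wrapped SGD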