U
    Jc;                    @   s  d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlmZ d dl	m
Z
 d dlmZmZ d dlmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZ d dlZd dlm Z! d dl"m   m#  m$  m%Z% d dl&m'Z' d dl(m'  m)Z* d dl+m,Z, d dlm-Z- d dl.m/Z/m0Z0m1Z1 d d	l"m2Z2 d d
l3m4Z4m5Z5 d dl6m7Z7 d dl8m9Z9m:Z:m;Z; d dl<m=Z= ddl>m?Z?m@Z@mAZAmBZBmCZCmDZDmEZEmFZFmGZGmHZH ddlImJZJmKZK ddlLmMZMmNZNmOZOmPZPmQZQmRZRmSZS ddlTmUZUmVZVmWZWmXZXmYZY ddlZm[Z[m\Z\m]Z] ddl^m_Z_m`Z`maZambZb dZczd dldmeZemfZf W n egk
rN   dZcY nX dZheiedsddZhehr~ddljmkZkmlZlmmZm ddddddd d!d"d#d$d%d&gZnd'Zoeod( e\ d( Zpeqd)ZrG d*d deZse
G d+d dZte
G d,d dZuG d-d deZvG d.d% d%eZwG d/d deZxe
G d0d  d Zye
G d1d! d!eyZze
G d2d" d"eyZ{e
G d3d# d#eyZ|exj}ezexj~e{exje|iZG d4d$ d$eZeeVd5f ZG d6d7 d7eZG d8d9 d9ZG d:d; d;ZesjeXjesjeXjesjeXjiZG d<d de'jZeej'j= eejd=d>d?ZdJej'jeeej'j=ee f d@dAdBZej'jeej'j=ef dCdDdEZej'jeeej'j=f dCdFdGZeedHdId&ZdS )K    N)contextmanager)	dataclass)Enumauto)AnyCallableDequeDict	GeneratorIterableIteratorListMapping
NamedTupleOptionalSetTupleUnioncast)Variable)ProcessGroup)ShardShardedTensorinit_from_local_shards)_CHECKPOINT_PREFIX)LOW_PRECISION_HOOKSdefault_hooks)_get_default_group)_replace_by_prefix_sync_params_and_buffers
_to_kwargs)	Parameter   )
 _broadcast_pos_dim_tensor_states%_broadcast_processed_optim_state_dict_flatten_optim_state_dict_get_param_id_to_param'_get_param_id_to_param_from_optim_input_get_param_to_param_id'_get_param_to_param_id_from_optim_input_optim_state_dict_process_pos_dim_tensor_state_rekey_sharded_optim_state_dict)_ext_chunk_tensor"_ext_pre_load_state_dict_transform)_apply_to_modules_apply_to_tensors_contains_batchnorm_free_storage_is_fsdp_flattened#_override_batchnorm_mixed_precisionp_assert)FlatParameterFlatParamHandleHandleConfigHandleShardingStrategyHandleTrainingState)
FLAT_PARAM
FPW_MODULEFlattenParamsWrapper)ParamExecOrderWrapPolicy
_or_policy_recursive_wrap_wrap_batchnorm_individuallyT)deferred_initfakeFZfx)TracingConfig_init_execution_info_patch_tracerFullyShardedDataParallelShardingStrategyMixedPrecision
CPUOffloadBackwardPrefetchStateDictTypeStateDictConfigFullStateDictConfigLocalStateDictConfigShardedStateDictConfigOptimStateKeyTypeTrainingState_clean_tensor_name_fsdp_wrapped_module.i  c                   @   s"   e Zd ZdZe Ze Ze ZdS )rH   a  
    This specifies the sharding strategy to be used for distributed training by
    :class:`FullyShardedDataParallel`.
    FULL_SHARD: Parameters, gradients, and optimizer states are sharded. For
                the parameters, this algorithm all-gathers before the forward,
                reshards after the forward, all-gathers before the backward
                computation, and reshards after the backward computation. The
                gradients are synchronized and sharded via reduce-scatter after
                the backward computation. The sharded optimizer states are
                updated locally.
    SHARD_GRAD_OP: Gradients and optimizer states are sharded during
                   computation, and additionally parameters are sharded outside
                   computation. For the parameters, this algorithm all-gathers
                   before the forward, does not reshard after the forward, and
                   only reshards after the backward computation. The gradients
                   are synchronized and sharded via reduce-scatter after the
                   backward computation. The sharded optimizer states are
                   updated locally. Inside ``no_sync()``, the parameters are
                   not resharded after the backward computation.
    NO_SHARD: Parameters, gradients, and optimizer states are not sharded but
              instead replicated across ranks, similar to PyTorch's
              ``DistributedDataParallel`` API. The gradients are synchronized
              via all-reduce after the backward computation. The unsharded
              optimizer states are updated locally.
    HYBRID_SHARD (future support): Apply ``FULL_SHARD`` intra-node and
                                  ``NO_SHARD`` inter-node.
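    A minimal construction sketch (hedged; ``MyModel`` is a placeholder module
    and the default process group is assumed to be initialized)::

        >>> # xdoctest: +SKIP("undefined variables")
        >>> fsdp_model = FullyShardedDataParallel(
        >>>     MyModel().cuda(),
        >>>     sharding_strategy=ShardingStrategy.SHARD_GRAD_OP,
        >>> )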

    N)__name__
__module____qualname____doc__r   
FULL_SHARDSHARD_GRAD_OPNO_SHARD r]   r]   V/tmp/pip-unpacked-wheel-gikjz4vx/torch/distributed/fsdp/fully_sharded_data_parallel.pyrH      s   c                   @   sX   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eej ed< dZee ed< dS )rI   a	  
    A config to enable mixed precision training with FullyShardedDataParallel.
    This class can be constructed with several flags:
        ``param_dtype`` controls the precision of model parameters, inputs, and
        therefore the precision under which computation happens. After forward
        and backward passes, FSDP parameters point to full precision shards
        that are kept in memory. Full precision parameters are always
        checkpointed.
        ``reduce_dtype`` controls the precision under which gradient reduction
        would occur, which can potentially be different than ``param_dtype``
        for use cases such as communication efficiency.
        ``buffer_dtype`` controls the precision that buffers are cast to. Note
        that buffers are unsharded and are cast in the first forward pass, and
        remain in their reduced precision state even after forward/backward
        passes. However, when taking checkpoints with ``state_dict``, buffers
        are checkpointed in their full precision (and then restored back
        to their reduced precision) as expected. Note that this checkpoint
        support is currently limited to ``StateDictType.FULL_STATE_DICT``.
        ``keep_low_precision_grads``: Whether to upcast gradients back to the
        full parameter precision after backwards or not. This can be disabled
        to keep the gradients in the lower precision, which can potentially
        save memory if custom Optimizers are able to perform parameter updates
        effectively with lower precision grads.

    .. note:: In ``summon_full_params``, parameters are summoned in full
        precision but buffers are not.

    .. note:: Parameters and buffers are checkpointed in full precision. For
        buffers, this is only guaranteed to work for ``StateDictType.FULL_STATE_DICT``.

    .. note:: This API is experimental and subject to change.

    .. note:: Specification of reduced precision types must be explicit, in that
        if, for example, ``param_dtype`` is not specified, it will not be cast by
        FSDP. Thus, a config such as ``MixedPrecision(reduce_dtype=torch.float16)``
        will not cast buffers or parameters. Note that if a ``MixedPrecision``
        config is specified without a ``reduce_dtype``, gradient communication
        would occur in the ``param_dtype`` precision, if given, and otherwise in the
        original parameter precision.
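    A hedged configuration sketch (``MyModel`` is a placeholder module; only the
    explicitly listed dtypes are cast by FSDP, per the note above)::

        >>> # xdoctest: +SKIP("undefined variables")
        >>> mp = MixedPrecision(
        >>>     param_dtype=torch.float16,
        >>>     reduce_dtype=torch.float16,
        >>>     buffer_dtype=torch.float16,
        >>> )
        >>> fsdp_model = FullyShardedDataParallel(MyModel().cuda(), mixed_precision=mp)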
    Nparam_dtypereduce_dtypebuffer_dtypeFkeep_low_precision_grads)rV   rW   rX   rY   r_   r   torchdtype__annotations__r`   ra   rb   boolr]   r]   r]   r^   rI      s
   
+c                   @   s   e Zd ZU dZdZeed< dS )rJ   a  
    CPU offloading config. Currently, only parameter and gradient CPU
    offload are supported.
    offload_params: Offloading parameters to CPUs when these parameters are
                    not used for computation on GPUs. This implicitly enables
                    gradient offloading to CPUs in order for parameters and
                    gradients to be on the same device to work with the optimizer.
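    A minimal sketch of enabling parameter offload (hedged; ``MyModel`` is a
    placeholder module)::

        >>> # xdoctest: +SKIP("undefined variables")
        >>> fsdp_model = FullyShardedDataParallel(
        >>>     MyModel().cuda(),
        >>>     cpu_offload=CPUOffload(offload_params=True),
        >>> )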
    Foffload_paramsN)rV   rW   rX   rY   rg   rf   re   r]   r]   r]   r^   rJ      s   
	c                   @   s   e Zd ZdZe Ze ZdS )rK   a  
    Specify where to prefetch next layer's full parameters
    during backward pass.
    BACKWARD_PRE: prefetch right before current layer's backward computation
                  starts. This approach will increase backward communication
                  and computation overlapping and potentially improve training
                  performance, but it may increase the peak memory usage as
                  the prefetched full parameters will be kept in the GPU memory
                  until next layer's backward computation is done.
    BACKWARD_POST: prefetch right after the current layer's backward computation finishes.
                   This approach will not increase peak memory, as prefetching happens
                   after the current layer's full parameters are freed.
                   It could potentially improve backward communication and computation
                   overlapping, as it avoids all_gather and reduce_scatter blocking
                   each other in the single NCCL stream. However, based on our experiments,
                   for some models the post-backward hook fire order is not always
                   the reverse of the forward computation order, so this
                   approach may prefetch full parameters for layers ahead of the next layer.
                   This 'ahead' all_gather could delay the next layer's all_gather in the
                   single NCCL stream and delay the next layer's computation, so it may
                   cause some performance regression for some models.
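    A hedged sketch of selecting a prefetch algorithm at construction time
    (``MyModel`` is a placeholder module)::

        >>> # xdoctest: +SKIP("undefined variables")
        >>> fsdp_model = FullyShardedDataParallel(
        >>>     MyModel().cuda(),
        >>>     backward_prefetch=BackwardPrefetch.BACKWARD_PRE,
        >>> )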
    N)rV   rW   rX   rY   r   BACKWARD_PREBACKWARD_POSTr]   r]   r]   r^   rK      s   c                   @   s.   e Zd ZdZe Ze Ze Ze Ze Z	dS )rR   a  
    Simple enum to indicate what state FSDP is in. Used for asserting
    to make sure APIs are called in the correct state.
    .. note::
        ``BACKWARD_PRE`` and ``BACKWARD_POST`` states are used to ensure we
        receive backward hooks in the correct order. It is used to catch
        unexpected order of hooks being called (likely due to our
        hook registration logic or autograd engine logic changes).
    N)
rV   rW   rX   rY   r   IDLEFORWARDrh   ri   SUMMON_FULL_PARAMSr]   r]   r]   r^   rR     s   
c                   @   s"   e Zd ZdZe Ze Ze ZdS )rL   a  
    This enum indicates which type of ``state_dict`` the FSDP module is
    currently processing (returning or loading).
    The default value is FULL_STATE_DICT to comply with the PyTorch convention.
    .. note::
        FSDP currently supports three types of ``state_dict``:
            1. ``state_dict/load_state_dict``: this pair of APIs return and load
               the non-sharded, unflattened parameters. The semantics are the
               same as using DDP.
            2. ``_local_state_dict/_load_local_state_dict``: this pair of APIs return
               and load local sharded, flattened parameters. The values returned
               by ``_local_state_dict`` can be directly used by FSDP and are only
               meaningful to FSDP (because parameters are flattened). Note that
               these APIs are meant for use via the :func:`state_dict_type`
               context manager as follows:
                   >>> # xdoctest: +SKIP("undefined variables")
                   >>> with fsdp.state_dict_type(StateDictType.LOCAL_STATE_DICT):
                   ...     state = fsdp.state_dict()  # loads local state dict
            3. ``_sharded_state_dict/_load_sharded_state_dict``: this pair of APIs
               return and load sharded, unflattened parameters. The ``state_dict``
               returned by ``sharded_state_dict`` can be used by all other parallel
               schemes (resharding may be required); see the sketch below.
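               A hedged sketch of item 3 (``fsdp`` is assumed to be an
               already-wrapped instance, as in the example above):
                   >>> # xdoctest: +SKIP("undefined variables")
                   >>> with fsdp.state_dict_type(StateDictType.SHARDED_STATE_DICT):
                   ...     sharded_state = fsdp.state_dict()  # sharded, unflattened values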
    N)rV   rW   rX   rY   r   FULL_STATE_DICTLOCAL_STATE_DICTSHARDED_STATE_DICTr]   r]   r]   r^   rL     s   c                   @   s   e Zd ZdZdS )rM   a  
    ``StateDictConfig`` is the base class for all state_dict configuration classes.
    Users should instantiate a child version (i.e. ``FullStateDictConfig``) in
    order to configure settings for the particular type of ``state_dict``
    implementation FSDP will use.
    N)rV   rW   rX   rY   r]   r]   r]   r^   rM   6  s   c                   @   s*   e Zd ZU dZdZeed< dZeed< dS )rN   a  
    ``FullStateDictConfig`` is a config class meant to be used with
    ``StateDictType.FULL_STATE_DICT``. Currently, it accepts two parameters,
    ``offload_to_cpu`` and ``rank0_only`` which can be configured to offload
    the full ``state_dict`` to CPU and to materialize the ``state_dict`` on
    rank 0 only. When used, it is recommended to enable both of these flags
    together to optimize memory savings when taking checkpoints. Note that
    this config class is meant for use via the :func:`state_dict_type`
    context manager as follows:
        >>> # xdoctest: +SKIP("undefined variables")
        >>> fsdp = FSDP(model, auto_wrap_policy=...)
        >>> cfg = FullStateDictConfig(offload_to_cpu=True, rank0_only=True)
        >>> with FullyShardedDataParallel.state_dict_type(fsdp, StateDictType.FULL_STATE_DICT, cfg):
        >>>     state = fsdp.state_dict()
        >>>     # state will be empty on non rank 0 and contain CPU tensors on rank 0.
        >>> # To reload checkpoint for inference, finetuning, transfer learning, etc:
        >>> model = model_fn() # Initialize model on CPU in preparation for wrapping with FSDP
        >>> if dist.get_rank() == 0:
        >>>     # Load checkpoint only on rank 0 to avoid memory redundancy
        >>>     state_dict = torch.load("my_checkpoint.pt")
        >>>     model.load_state_dict(state_dict)
        >>> # All ranks initialize FSDP module as usual. ``sync_module_states`` argument
        >>> # communicates loaded checkpoint states from rank 0 to rest of the world.
        >>> fsdp = FSDP(model, device_id=torch.cuda.current_device(), auto_wrap_policy=..., sync_module_states=True)
        >>> # After this point, all ranks have FSDP model with loaded checkpoint.
    Foffload_to_cpu
rank0_onlyN)rV   rW   rX   rY   rp   rf   re   rq   r]   r]   r]   r^   rN   @  s   
c                   @   s   e Zd ZdS )rO   NrV   rW   rX   r]   r]   r]   r^   rO   _  s   c                   @   s   e Zd ZdS )rP   Nrr   r]   r]   r]   r^   rP   c  s   c                   @   s   e Zd Ze Ze ZdS )rQ   N)rV   rW   rX   r   
PARAM_NAMEPARAM_IDr]   r]   r]   r^   rQ   m  s   .c                   @   s"   e Zd ZdZe Ze Ze ZdS )_ExecOrderWarnStatusz/Used internally for execution order validation.N)rV   rW   rX   rY   r   NONEWARNINGWARNEDr]   r]   r]   r^   ru   x  s   ru   c                   @   s   e Zd ZdZejeeddddZdejdddd	Z	e
ee
 d
ddZe
ee
 d
ddZee ddddZee eddddZe
eddddZe
eee df dddZeedf eee  dddZe
eee  ddd Zd!d" ZdS )#_ExecOrderDataae  
    This contains the data structures to track the execution order. We track
    the pre-forward order on the *first* iteration for forward prefetching
    (which thus assumes static graph) and the post-forward order on *every*
    iteration for backward prefetching (which thus does not assume static
    graph but may provide an incorrect order).
    N)debug_levelbackward_prefetch_limitforward_prefetch_limitreturnc                 C   sp   g | _ i | _g | _i | _d| _|| _|| _|tjj	tjj
fk| _d | _d | _g | _i | _i | _d| _tj| _d S )NTr   )handles_pre_forward_order"handles_to_pre_forward_order_indexhandles_post_forward_order#handles_to_post_forward_order_indexis_first_iter_backward_prefetch_limit_forward_prefetch_limitdist
DebugLevelINFOZDETAIL_checking_orderprocess_group
world_sizeall_handleshandle_to_handle_index"flat_param_to_prefixed_param_namescurrent_order_indexru   rv   warn_status)selfrz   r{   r|   r]   r]   r^   __init__  s     z_ExecOrderData.__init__rG   )	fsdp_rootr   r}   c                 C   sv   || _ | | _| | _||D ]0}|jD ]$}t| j}| j| || j	|< q.q$t
tttt f t|| _dS )z
        Initializes the data structures needed for checking the forward order.
        This should be called after a root FSDP instance has been set during
        lazy initialization.
        N)r   ranksizer   fsdp_modules_handleslenr   appendr   r   r	   r6   r   str _get_param_to_unflat_param_namesr   )r   r   r   fsdp_modulehandleindexr]   r]   r^   init  s    




z_ExecOrderData.initcurrent_handles_keyr}   c                 C   s^   | j |d}|dkrdS |d }g }t| jD ](}|dk r@ qZ|| j|  |d8 }q0|S )z
        Returns a :class:`list` of the handles keys of the handles to backward
        prefetch given the current handles key. If there are no valid handles
        keys to prefetch, then this returns an empty :class:`list`.
        Nr"   r   )r   getranger   r   r   r   r   Zcurrent_indexZtarget_indextarget_handles_keys_r]   r]   r^    get_handles_to_backward_prefetch  s    	
z/_ExecOrderData.get_handles_to_backward_prefetchc                 C   sd   | j |d}|dkrdS |d }g }t| jD ].}|t| jkrF q`|| j|  |d7 }q0|S )z
        Returns a :class:`list` of the handles keys of the handles to forward
        prefetch given the current handles key. If there are no valid handles
        keys to prefetch, then this returns an empty :class:`list`.
        Nr"   )r   r   r   r   r   r~   r   r   r]   r]   r^   get_handles_to_forward_prefetch  s    	
z._ExecOrderData.get_handles_to_forward_prefetchhandlesr}   c                 C   sB   |sdS t |}|| jkrdS t| j}|| j|< | j| dS )a  
        Records ``handles`` in the post-forward order, where ``handles`` should
        be a group of handles used in the same module's forward. If ``handles``
        is empty, then it is omitted.

        Unlike :meth:`record_pre_forward`, this records the order *every*
        iteration with the expectation that the recorded order is reset in
        :meth:`next_iter`.
        N)tupler   r   r   r   )r   r   handles_keyr   r]   r]   r^   record_post_forward  s    



z"_ExecOrderData.record_post_forward)r   is_trainingr}   c                 C   sT   |sdS t |}| || | jr,|| jkr0dS t| j}|| j|< | j| dS )ab  
        Records ``handles`` in the pre-forward order on the first iteration,
        where ``handles`` should be a group of handles used in the same
        module's forward. If ``handles`` is empty, then it is omitted.

        On the first iteration, this checks the execution order across ranks.
        See :meth:`_check_order` for details.
        N)r   _check_orderr   r   r   r~   r   )r   r   r   r   r   r]   r]   r^   record_pre_forward  s    	

z!_ExecOrderData.record_pre_forward)r   r   r}   c                    sB  |sdS | j rd}| |}|d j}tdd |D  tj|d}tj| jf|tj gf|}t	j
|| jd tfddt| jD d	D ]>\\}}	\}
}|	|krt| d
| d|	 d|
 d| d
qtj| j  f|tj|f|}t	j
|| jd t fddt| jD d	D ]T\\}}\}
}||kr.| |}| |}t| d
| d| d|
 d| 	q.n| jr>| jtjkrdS d}| jt| jkrd}n,| j| j }||kr| |}d| d}|dk	r0| |}|r| nd}td| j d| |  tj| _|  jd7  _dS )a  
        Checks the forward execution order as long as ``is_training`` is
        ``True`` since checking in eval mode is not supported.

        - On the first iteration, this uses all-gathers to check that all ranks
        are all-gathering the same handles and hence ``FlatParameter`` s,
        raising an error if not.
        - On subsequent iterations, if the distributed debug level is at least
        INFO, then this checks that each rank is locally consistent with its
        own forward order from the first iteration, issuing a warning if not.
        This issues a warning on the first deviating iteration and stops
        warning thereafter.
        Nz#Forward order differs across ranks:r   c                 s   s   | ]}|d k	V  qd S Nr]   ).0r   r]   r]   r^   	<genexpr>;  s     z._ExecOrderData._check_order.<locals>.<genexpr>rd   devicegroupc                 3   s   | ]}| | fV  qd S r   r]   r   r   )world_num_valid_indicesr]   r^   r   I  s      z rank z is all-gathering z parameters while rank z parametersc                 3   s*   | ]"}||  |d     fV  qdS )r"   Nr]   r   )num_valid_indicesworld_indicesr]   r^   r   ]  s    
z! is all-gathering parameters for z while rank zfExpected to not all-gather any more parameters in the forward but trying to all-gather parameters for zExpected to all-gather for z) but trying to all-gather parameters for z/a newly-added parameter since construction timez?Forward order differs from that of the first iteration on rank zD. Collectives are unchecked and may give incorrect results or hang.
r"   )r   _get_handle_indicesr   sumrc   Zint32zerosr   tensorr   _all_gather_baser   	itertoolscombinationsr   RuntimeError_get_names_from_handle_indicesr   r   ru   rx   r   r   r~   _get_names_from_handleswarningswarnr   rw   )r   r   r   
msg_prefixZlocal_indicesr   Ztensor_kwargsZlocal_num_valid_indicesZr1Zn1Zr2Zn2i1i2Zr1_param_namesZr2_param_namesZexpected_handles_keyZexpected_param_namesparam_namesZ
msg_suffixr]   )r   r   r   r^   r   #  s    
	
   	








z_ExecOrderData._check_order.r   r}   c                 C   s<   g }|D ]*}|| j kr"|d q|| j |  qt|S )z
        Returns the handle indices (i.e. indices into ``self.all_handles``)
        corresponding to the handles in ``handles_key``. An entry in the
        returned tuple is ``None`` if the handle is invalid.
        N)r   r   r   )r   r   indicesr   r]   r]   r^   r     s    	
z"_ExecOrderData._get_handle_indices)handle_indicesr}   c                 C   sR   g }|D ]D}|dks|dk s|t | jkr,q| j| }|j}|| j|  q|S )z
        Returns a list of prefixed parameter names for each handle in
        ``handle_indices``. If a handle index is invalid, then its prefixed
        parameter names are omitted from the returned list.
        Nr   )r   r   
flat_paramr   r   )r   r   prefixed_param_namesr   r   r   r]   r]   r^   r     s    	
z-_ExecOrderData._get_names_from_handle_indicesc                 C   s4   g }|D ]&}|j }|| jkrq|| j|  q|S )z
        Returns a list of prefixed parameter names for each handle in
        ``handles_key``. If a handle is invalid, then its prefixed parameter
        names are omitted from the returned list.
        )r   r   r   )r   r   r   r   r   r]   r]   r^   r     s    	
z&_ExecOrderData._get_names_from_handlesc                 C   s>   d| _ | j  | j  | jr:d| _| jtjkr:tj	| _dS )z
        Advances the internal data structures per iteration. This should be
        called in the post-backward callback since that marks the true end of
        an iteration.
        Fr   N)
r   r   clearr   r   r   r   ru   rw   rx   r   r]   r]   r^   	next_iter  s    

z_ExecOrderData.next_iter)rV   rW   rX   rY   r   r   intr   r   r   _HandlesKeyr   r   r   r7   r   rf   r   r   r   r   r   r   r   r   r   r]   r]   r]   r^   ry     s:   
(v


ry   c                   @   s^   e Zd ZdZddddZejjddddZe	ejj dd	d
Z
e	ejj dddZdS )_FreeEventQueuez
    This tracks all pending frees corresponding to inflight all-gathers. The
    queueing pattern is iterative enqueues with a single dequeue per iteration
    once the limit ``_max_num_inflight_all_gathers`` is reached.
    Nr}   c                 C   s   t  | _d| _d S )Nr   )collectionsdeque_queue_max_num_inflight_all_gathersr   r]   r]   r^   r     s    
z_FreeEventQueue.__init__)
free_eventr}   c                 C   s   | j | dS )zEnqueues a free event.N)r   r   )r   r   r]   r]   r^   enqueue  s    z_FreeEventQueue.enqueuec                 C   s   t | j| jkr|  S dS )z0Dequeues a single event if the limit is reached.N)r   r   r   _dequeuer   r]   r]   r^   dequeue_if_needed  s    z!_FreeEventQueue.dequeue_if_neededc                 C   s   | j r| j  }|S dS )z"Dequeues a free event if possible.N)r   popleft)r   eventr]   r]   r^   r     s    
z_FreeEventQueue._dequeue)rV   rW   rX   rY   r   rc   cudaEventr   r   r   r   r]   r]   r]   r^   r     s
   r   c                       s
  e Zd ZdZdejee ee ee	 ee
 ee ee eeejj  ee
ejgdf  eeeejf  eeed fddZejeeejj  eej ddd	Zejjeejj eeejj ee f d
ddZejee dddZeeef eeef ddddZejeej ddddZeeeejf  eej dddZejee
ejgdf  eej ddddZ ejeej eej dddZ!ejeej eej ejdd d!Z"eje#ej dd"d#d$Z$ejeej e%ej dd%d&Z&eej dd'd(d)Z'e(dd*d+d,Z)e* e#e( dd-d.d/Z+e#e( e#e dd0d1d2Z,e-ejd3d4d5Z.eed6 fd7d8Z/eed9d:d;Z0ed3d<d=Z1e2dejee#d  d>d?d@Z3e
ejgdf d dA fdBdCZ4ed3dDdEZ5ed3dFdGZ6ed3dHdIZ7ed3dJdKZ8ed3dLdMZ9ej:eeeeef dNdOdPZ;deej eeeej:f  ee eddRdSdTZ<dd3dUdVZ=dd3dWdXZ>e* e(dd*dYdZZ?dd3d[d\Z@dd3d]d^ZAeBdd_d`daZCeBe#eB d_dbdcZDeBeEdddedfZFe2eGjHdejeIeeJ eKdgdhdiZLeedjdkdlZMe-e%eeeef  d3dmdnZNeeef eeeef dodpdqZOeeef eeeef dodrdsZPe* eeef eeeef dodtduZQe2ejeeef eeeeef dvdwdxZR fdydzZSeeed{d|d}ZTdd3d~dZUeeed{ddZVeeef eddoddZWdd3ddZXeeef eddoddZYdd3ddZZeeef eddoddZ[e2ejeeef eeddvddZ\e2ejeddddZ]e^eef e_d fddZ`e^eef e_dddZadeeeejbf df ee_dddZceeed{ddZde#e( ee
 ejedddZee#e( dd-ddZfe#e( ee
 ejeeedddZgdd Zhdd Zie2eGjHdeeeeeKdddZjeGjHdeeeedddZke* e#e( dddZle%eeejbf  d3 fddZme%eeejjf  d3 fddZnee#e( edddZoe#e( dd-ddZpe* e(eddddZqejberdddZse(dddZtdd3ddĄZue* dd3ddƄZvdd3ddȄZweexe#ex f ddɜdd˄ZyeHeKd3dd̈́Zze-e#e d3ddτZ{e* dee|ef ee|ef ddќddӄZ}e2ddՄ Z~e2ed3ddׄZe2dejjejjeee#eeef  eejj f  eeej eeef d؜ddڄZe2d ejjejjeee#eeef  eejj f  eej eeef dۜdd݄Ze2deeef ejjeee#eeef  eejj f  eejj eeef dޜddZe2deeef ejjeee#eeef  eejj f  eejj eeef dddZe2deeeef  ejjeee#eeef  eejj f  eejj ee eeef dddZe2deeef eejjeee#eeef  eejj f  eejj eeef dddZed3ddZed3ddZeedddZdd3ddZed3ddZed3ddZ  ZS (  rG   a62  
    A wrapper for sharding Module parameters across data parallel workers. This
    is inspired by `Xu et al.`_ as well as the ZeRO Stage 3 from DeepSpeed_.
    FullyShardedDataParallel is commonly shortened to FSDP.

    .. _`Xu et al.`: https://arxiv.org/abs/2004.13336
    .. _DeepSpeed: https://www.deepspeed.ai/

    Example::

        >>> # xdoctest: +SKIP("undefined variables")
        >>> import torch
        >>> from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
        >>> torch.cuda.set_device(device_id)
        >>> sharded_module = FSDP(my_module)
        >>> optim = torch.optim.Adam(sharded_module.parameters(), lr=0.0001)
        >>> x = sharded_module(x, y=3, z=torch.Tensor([1]))
        >>> loss = x.sum()
        >>> loss.backward()
        >>> optim.step()

    .. warning::
        The optimizer must be initialized *after* the module has been wrapped,
        since FSDP will shard parameters in-place and this will break any
        previously initialized optimizers.

    .. warning::
        If the destination CUDA device has ID ``dev_id``, either (1)
        ``module`` should already be placed on that device, (2) the device
        should be set using ``torch.cuda.set_device(dev_id)``, or (3)
        ``dev_id`` should be passed into the ``device_id`` constructor
        argument. This FSDP instance's compute device will be that destination
        device. For (1) and (3), the FSDP initialization always occurs on GPU.
        For (2), the FSDP initialization happens on ``module`` 's current
        device, which may be CPU.

    .. warning::
        FSDP currently does not support gradient accumulation outside
        ``no_sync()`` when using CPU offloading. Trying to do so yields
        incorrect results since FSDP will use the newly-reduced gradient
        instead of accumulating with any existing gradient.

    .. warning::
        Changing the original parameter variable names after construction will
        lead to undefined behavior.

    .. warning::
        Passing in the ``sync_module_states=True`` flag requires the module to be put
        on GPU, or to use the ``device_id`` argument to specify a CUDA device that
        FSDP will move module to. This is because ``sync_module_states=True``
        requires GPU communication.

    .. warning::
        As of PyTorch 1.12, FSDP only offers limited support for shared parameters
        (for example, setting one ``Linear`` layer's weight to another's). In
        particular, modules that share parameters must be wrapped as part of the
        same FSDP unit. If enhanced shared parameter support is needed for your
        use case, please ping https://github.com/pytorch/pytorch/issues/77724

    .. note::
        Inputs into FSDP ``forward`` function will be moved to compute device
        (same device FSDP module is on) before running ``forward``, so user does
        not have to manually move inputs from CPU -> GPU.

    Args:
        module (nn.Module):
            module to be wrapped with FSDP.
        process_group (Optional[ProcessGroup]):
            process group for sharding
        sharding_strategy (Optional[ShardingStrategy]):
            Configures the sharding algorithm; different sharding algorithms trade
            off memory saving against communication overhead. ``FULL_SHARD``
            will be chosen if sharding_strategy is not specified.
        cpu_offload (Optional[CPUOffload]):
            CPU offloading config. Currently, only parameter and gradient CPU
            offload is supported. It can be enabled via passing in
            ``cpu_offload=CPUOffload(offload_params=True)``. Note that this
            currently implicitly enables gradient offloading to CPU in order for
            params and grads to be on the same device to work with the optimizer. This
            API is subject to change. Default is ``None`` in which case there
            will be no offloading.
        auto_wrap_policy (Optional[Callable[[nn.Module, bool, int], bool]]):
            A callable specifying a policy to recursively wrap layers with FSDP.
            Note that this policy currently will only apply to child modules of
            the passed in module. The remaining modules are always wrapped in
            the returned FSDP root instance.
            ``size_based_auto_wrap_policy`` written in ``torch.distributed.fsdp.wrap`` is
            an example of an ``auto_wrap_policy`` callable; this policy wraps layers
            with more than 100M parameters. ``transformer_auto_wrap_policy``
            written in ``torch.distributed.fsdp.wrap`` is an example of ``auto_wrap_policy``
            callable for transformer-like model architectures. Users can supply the customized
            ``auto_wrap_policy`` callable that should accept following arguments:
            ``module: nn.Module``, ``recurse: bool``, ``unwrapped_params: int``, and return
            a ``bool`` specifying whether the passed in ``module``` should be wrapped
            (if ``recurse=False``) or whether we should recurse down the subgraph of ``module``
            children (if ``recurse=True``). Extra customized arguments could be added to
            the customized ``auto_wrap_policy`` callable as well. It is a good practice to
            print out the sharded model and check whether the sharded model is what
            the application wants and then adjust accordingly.

            Example::

                >>> def custom_auto_wrap_policy(
                >>>     module: nn.Module,
                >>>     recurse: bool,
                >>>     unwrapped_params: int,
                >>>     # These are customizable for this policy function.
                >>>     min_num_params: int = int(1e8),
                >>> ) -> bool:
                >>>     return unwrapped_params >= min_num_params
                >>> # Configure a custom min_num_params
                >>> my_auto_wrap_policy = functools.partial(custom_auto_wrap_policy, min_num_params=1e5)

        backward_prefetch (Optional[BackwardPrefetch]):
            This is an experimental feature that is subject to change in the
            near future. It allows users to enable two different backward_prefetch
            algorithms to help backward communication and computation overlapping.
            Pros and cons of each algorithm are explained in the class ``BackwardPrefetch``.
        mixed_precision (Optional[MixedPrecision]): A ``MixedPrecision`` instance
            describing the mixed precision training config to be used. ``MixedPrecision``
            supports configuring parameter, buffer, and gradient communication dtype. Note
            that only floating point data is cast to the reduced precision. This allows
            users potential memory saving and training speedup while trading off
            accuracy during model training. If ``None``, no mixed precision is applied.
            Note that if ``mixed_precision`` is enabled for FSDP model that
            contains ``BatchNorm`` with ``auto_wrap_policy``, FSDP will take
            care to disable mixed precision for ``BatchNorm`` units by wrapping
            them separately in their own FSDP unit with ``mixed_precision=None``.
            This is done because several ``BatchNorm`` kernels do not implement
            reduced type support at the moment. If individually wrapping the model,
            users must take care to set ``mixed_precision=None`` for
            ``BatchNorm`` units.
            (Default: ``None``)
        ignored_modules (Optional[Iterable[torch.nn.Module]]): Modules whose
            own parameters and child modules' parameters and buffers are
            ignored by this instance. None of the modules directly in
            ``ignored_modules`` should be :class:`FullyShardedDataParallel`
            instances, and any child modules that are already-constructed
            :class:`FullyShardedDataParallel` instances will not be ignored if
            they are nested under this instance. This argument may be used to
            avoid sharding specific parameters at module granularity when using an
            ``auto_wrap_policy`` or if parameters' sharding is not managed by
            FSDP. (Default: ``None``)
        param_init_fn (Optional[Callable[[nn.Module], None]]):
            A ``Callable[torch.nn.Module] -> None`` that
            specifies how modules that are currently on the meta device should be initialized
            onto an actual device. Note that as of v1.12, we detect modules on the meta
            device via ``is_meta`` check and apply a default initialization that calls
            ``reset_parameters`` method on the passed in ``nn.Module`` if ``param_init_fn``
            is not specified, otherwise we run ``param_init_fn`` to initialize the passed
            in ``nn.Module``. In particular, this means that if ``is_meta=True`` for any
            module parameters for modules that will be wrapped with FSDP and ``param_init_fn``
            is not specified, we assume your module properly implements a ``reset_parameters()``
            and will throw errors if not. Note that additionally, we offer support for modules
            initialized with torchdistX's (https://github.com/pytorch/torchdistX)
            ``deferred_init`` API. In this case, deferred modules would be initialized
            by a default initialization function that calls torchdistX's
            ``materialize_module``, or the passed in ``param_init_fn``, if it is not
            ``None``. The same ``Callable`` is applied to initialize all meta modules.
            Note that this initialization function is applied before doing any FSDP sharding
            logic.

            Example::

                >>> # xdoctest: +SKIP("undefined variables")
                >>> module = MyModule(device="meta")
                >>> def my_init_fn(module):
                >>>     # responsible for initializing a module, such as with reset_parameters
                >>>     ...
                >>> fsdp_model = FSDP(module, param_init_fn=my_init_fn, auto_wrap_policy=size_based_auto_wrap_policy)
                >>> print(next(fsdp_model.parameters()).device) # current CUDA device
                >>> # With torchdistX
                >>> module = deferred_init.deferred_init(MyModule, device="cuda")
                >>> # Will initialize via deferred_init.materialize_module().
                >>> fsdp_model = FSDP(module, auto_wrap_policy=size_based_auto_wrap_policy)

        device_id (Optional[Union[int, torch.device]]): An ``int`` or ``torch.device``
            describing the CUDA device the FSDP module should be moved to, determining where
            initialization such as sharding takes place. If this argument is not specified
            and ``module`` is on CPU, we issue a warning mentioning that this argument can
            be specified for faster initialization. If specified, resulting FSDP instances
            will reside on this device, including moving ignored modules' parameters if
            needed. Note that if ``device_id`` is specified but ``module`` is already on a
            different CUDA device, an error will be thrown. (Default: ``None``)
        sync_module_states (bool): If ``True``, each individually wrapped FSDP unit will broadcast
            module parameters from rank 0 to ensure they are the same across all ranks after
            initialization. This helps ensure model parameters are the same across ranks
            before starting training, but adds communication overhead to ``__init__``, as at least
            one broadcast is triggered per individually wrapped FSDP unit.
            This can also help load checkpoints taken by ``state_dict`` and to be loaded by
            ``load_state_dict`` in a memory efficient way. See documentation for
            :class:`FullStateDictConfig` for an example of this. (Default: ``False``)
        forward_prefetch (bool): If ``True``, then FSDP *explicitly* prefetches
            the next upcoming all-gather while executing in the forward pass.
            This may improve communication and computation overlap for CPU
            bound workloads. This should only be used for static graph models
            since the forward order is fixed based on the first iteration's
            execution. (Default: ``False``)
        limit_all_gathers (bool): If ``False``, then FSDP allows the CPU
            thread to schedule all-gathers without any extra synchronization.
            If ``True``, then FSDP explicitly synchronizes the CPU thread to
            prevent too many in-flight all-gathers. This ``bool`` only affects
            the sharded strategies that schedule all-gathers. Enabling this can
            help lower the number of CUDA malloc retries.
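    A hedged construction sketch combining several of the arguments above
    (``MyModel`` and ``my_auto_wrap_policy`` are placeholder names)::

        >>> # xdoctest: +SKIP("undefined variables")
        >>> fsdp_model = FullyShardedDataParallel(
        >>>     MyModel().cuda(),
        >>>     sharding_strategy=ShardingStrategy.FULL_SHARD,
        >>>     cpu_offload=CPUOffload(offload_params=True),
        >>>     auto_wrap_policy=my_auto_wrap_policy,
        >>>     backward_prefetch=BackwardPrefetch.BACKWARD_PRE,
        >>>     mixed_precision=MixedPrecision(param_dtype=torch.float16),
        >>>     device_id=torch.cuda.current_device(),
        >>>     limit_all_gathers=True,
        >>> )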
    NFmoduler   sharding_strategycpu_offloadauto_wrap_policybackward_prefetchmixed_precisionignored_modulesparam_init_fn	device_idsync_module_statesforward_prefetchlimit_all_gathersc                    sd  t |tr2| j|||||||||	|
|||d d S tjd t   | ||| _	| 
|| j	\}| _| || _|d k	r||t| j	|dd}||||||	|
|||d
}| || |pt | _| j | _| j | _tj| _|pt | _|| _|| _|| _d}d}| jdkrtj}|p&tj| _ |p4t! | _"i | _#| $|| | %|
}| &||	| | '||| | (|||| _)t*| +||}|r| ,|| t-t.| j  | jj/| j"j0| j"j1| j"j2}t3||| j)|| _4| 5| g | _6g | _7| j4j8rh| j4j9}| j7:|j; | <| |=| j | jj/rh|j;j>t>dkrht?  |@t>d W 5 Q R X d| _A| B | _C| D | _Ed| _Fi | _Gd | _Hi | _ItJ | _KtLM | _NtO| jN||| _Pi | _Qi | _Ri | _StTjU| _VtW | _X| Y| jZ tTjU| j[tTj\| j]tTj^| j_i| _`| ja| jbdd	 tTjU| jctTj\| jdtTj^| jei| _f| g| jh tTjU| jitTj\| jjtTj^| jki| _ld S )
Nr   ztorch.distributed.fsdpT)r   r   Zwrapper_clsr   ignored_paramsZonly_wrap_children)
r   r   r   r   r   r   r   r   r   r   r"   cpuF)Zwith_module)m
isinstancer>   "_init_param_exec_order_wrap_policyrc   Z_CZ_log_api_usage_oncesuperr   _get_ignored_modules_ignored_modules_get_ignored_params_ignored_param_names_get_buffer_names_buffer_namesrG   
_auto_wrapr   r   r   r   r   rR   rj   training_staterJ   r   r   r   r   rH   r\   rZ   r   rI   r   _buffer_name_to_orig_dtype_check_single_device_module_get_device_from_device_id_materialize_module_move_module_to_device_get_compute_devicecompute_devicelist_get_orig_params_sync_module_statesr8   sharding_strategy_maprg   r_   r`   rb   r=   rT   _check_orig_params_flattenedr   params
has_paramsr   r   r   _register_param_handleshardr   no_grad_flat_param_to_sync_gradients_get_default_comm_hook_communication_hook_get_default_comm_hook_state_communication_hook_state_hook_registered_ran_pre_backward_hook_is_root_streamsr   _free_event_queuer   Zget_debug_levelZ_debug_levelry   _exec_order_data_handles_prefetched_needs_pre_backward_unshard_needs_pre_forward_unshardrL   rm   _state_dict_typerN   _state_dict_configZ_register_state_dict_hook_post_state_dict_hook_full_post_state_dict_hookrn   _local_post_state_dict_hookro   _sharded_post_state_dict_hook_post_state_dict_hook_fnZ"_register_load_state_dict_pre_hook_pre_load_state_dict_hook_full_pre_load_state_dict_hook_local_pre_load_state_dict_hook!_sharded_pre_load_state_dict_hook_pre_load_state_dict_hook_fnZ"register_load_state_dict_post_hook_post_load_state_dict_hook_full_post_load_state_dict_hook _local_post_load_state_dict_hook"_sharded_post_load_state_dict_hook_post_load_state_dict_hook_fn)r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   auto_wrap_kwargsfsdp_kwargsr{   r|   device_from_device_idZparams_to_flattenconfigr   	__class__r]   r^   r     s   

 
	







          z!FullyShardedDataParallel.__init__)root_moduler   r}   c                 C   s   |dkrt  S d}zt |}W n* tk
rH   t|dt|  Y nX |D ]:}t|tjjsvt|dt|  t|trNtdqNt dd |D }||krt	
d|  | D ](}t|trt|d	st||j q|S )
a3  
        Checks that ``_ignored_modules`` is an iterable of ``nn.Module`` s
        without any FSDP instances, and returns the modules contained in their
        module subtrees as a :class:`set`. Nested FSDP instances are excluded,
        but their already-computed ignored modules are included.
        Nz>`ignored_modules` should be an iterable of `torch.nn.Module`s zbut got zbut got an iterable with z1`ignored_modules` should not include FSDP modulesc                 s   s.   | ]&}|  D ]}t|ttfs|V  qqd S r   )modulesr   rG   r=   )r   r   childr]   r]   r^   r     s
   
 z@FullyShardedDataParallel._get_ignored_modules.<locals>.<genexpr>zTrying to ignore the top-level module passed into the FSDP constructor itself will result in all parameters being ignored and is not well-supported: r   )set	TypeErrortyper   rc   nnModulerG   
ValueErrorr   r   r6  hasattrAssertionErrorupdater   )r   r5  r   r   Zignored_root_modulesr   r   	submoduler]   r]   r^   r     s0    


z-FullyShardedDataParallel._get_ignored_modules)r5  r   r}   c           
      C   sd   t dd |D }t|dd}t  }|D ]2}|| }g }|D ]}	|t|	 q<|| q(||fS )z
        Returns the parameters of the modules in ``ignored_modules``,
        excluding any :class:`FlatParameter` s, and their fully prefixed names,
        both as :class:`set` s.
        c                 s   s(   | ] }|  D ]}t|s|V  qqd S r   )
parametersr3   )r   mpr]   r]   r^   r     s
   
 z?FullyShardedDataParallel._get_ignored_params.<locals>.<genexpr>Fdedup_shared_params)r8  r   r   rS   r@  )
r   r5  r   r   param_to_unflat_param_namesZignored_param_namesparamZunflat_param_namesZclean_nameskr]   r]   r^   r     s    
z,FullyShardedDataParallel._get_ignored_params)r5  r}   c                 C   s>   t jttt ddd}tt ddd}t }t||||S )z
        Returns the fully prefixed names of all buffers in the module hierarchy
        rooted at ``root_module`` as a class:`set`.
        )r   prefixbuffer_namesc                 S   s:   t | ts6| jddD ]\}}t|| }|| qd S )NFrecurse)r   rG   named_buffersrS   add)r   rJ  rK  buffer_namer   Zprefixed_buffer_namer]   r]   r^   	module_fn  s    
z=FullyShardedDataParallel._get_buffer_names.<locals>.module_fn)rK  c                 W   s   | S r   r]   )rK  argsr]   r]   r^   	return_fn  s    z=FullyShardedDataParallel._get_buffer_names.<locals>.return_fn)r;  r<  r   r   r8  r/   )r   r5  rQ  rS  rK  r]   r]   r^   r     s    	z*FullyShardedDataParallel._get_buffer_names)r/  r0  r}   c                 C   s   |d }|d }|dk	st | D ]"\}}t|tr$td| dq$|d }|dk	rt|rt| tjt	t
|gd}td ||d< tf || dS )	a  
        Recursively auto wraps the root module given by the key "module" in
        ``auto_wrap_kwargs`` with the arguments in ``auto_wrap_kwargs`` and
        ``fsdp_kwargs``.

        Precondition: ``auto_wrap_policy`` contains the arguments expected by
        ``_recursive_wrap()``, where ``auto_wrap_policy`` is not ``None``.
        ``fsdp_kwargs`` contains all FSDP arguments except ``module``.
        r   r   N	Expected zB to NOT be FullyShardedDataParallel if using an `auto_wrap_policy`r   )Zpoliciesa  Both mixed precision and an `auto_wrap_policy` were specified for FSDP, where the wrapped module has batch norm submodules. The batch norm submodules will be wrapped as separate FSDP instances with mixed precision disabled since some batch norm kernels do not support low precision.)r?  Znamed_modulesr   rG   r=  r1   r4   	functoolspartialr?   rA   r   r   r@   )r   r/  r0  r   r5  module_namer   r   r]   r]   r^   r     s(    

 z#FullyShardedDataParallel._auto_wrap)r   r   r}   c                 C   s8   t dd | ||D }t|dkr4td| dS )a  
        Raises an error if ``module`` has original parameters on multiple
        devices, ignoring the parameters in ``ignored_params``. Thus, after
        this method, the module must be either fully on the CPU or fully on a
        non-CPU device.
        c                 s   s   | ]}|j V  qd S r   r   r   rH  r]   r]   r^   r     s    zGFullyShardedDataParallel._check_single_device_module.<locals>.<genexpr>r"   z;FSDP only supports single device modules but got params on N)r8  r  r   r   )r   r   r   Zdevicesr]   r]   r^   r     s    
z4FullyShardedDataParallel._check_single_device_module)r   r}   c              	   C   sp   |dkrdS t |tjr|nt|}|tdkrltd| d| j dtj  d tdtj }|S )z	
        Nr   z"FSDP got the argument `device_id` z	 on rank zJ, which does not have an explicit index. FSDP will use the current device z. If this is incorrect, please explicitly call `torch.cuda.set_device()` before FSDP initialization or pass in the explicit device index as the `device_id` argument.)r   rc   r   r   r   r   r   current_device)r   r   r   r]   r]   r^   r      s    
z3FullyShardedDataParallel._get_device_from_device_id)r   r   r1  r}   c              
   C   s
  t dd | D }| o4to4t dd | D }|s>|rp|dk	rpt|sftd| dt| || n|r|ptj }|j	|d z t
  |  W 5 Q R X W n< tk
r } ztdt| d	 |W 5 d}~X Y nX n|rtj|d
d d dS )ae  
        Materializes the wrapped module ``module`` in place if needed: either
        if the module has parameters that use meta device or are torchdistX
        fake tensors.

        This method uses ``param_init_fn`` to materialize the module if the
        function is not ``None`` and falls back to default behavior otherwise.
        For meta device, this moves the module to ``device_from_device_id`` if
        it is not ``None`` or the current device otherwise and calls
        ``reset_parameters()``, and for torchdistX fake tensors, this calls
        ``deferred_init.materialize_module()``.
        c                 s   s   | ]}|j V  qd S r   )is_metar   rD  r]   r]   r^   r   K  s     z?FullyShardedDataParallel._materialize_module.<locals>.<genexpr>c                 s   s   | ]}t |V  qd S r   )rC   Zis_faker\  r]   r]   r^   r   O  s     NrT  z to be callable but got rX  zIUnable to call `reset_parameters()` for module on meta device with error zE. Please ensure your module implements a `reset_parameters()` method.c                 S   s   t | t S r   )r   rG   )rI  r]   r]   r^   <lambda>k      z>FullyShardedDataParallel._materialize_module.<locals>.<lambda>)Zcheck_fn)anyrB  _TORCHDISTX_AVAILcallabler=  r:  rc   r   rZ  Zto_emptyr  Zreset_parametersBaseExceptionr   r   r   rB   Zmaterialize_module)r   r   r   r1  Zis_meta_moduleZis_torchdistX_deferred_initZmaterialization_deviceer]   r]   r^   r  9  sB    

z,FullyShardedDataParallel._materialize_module)r   r   r1  c              
   C   s   t d}t| ||d}|dkr(dS |dk	r|j|kr||}| D ]F}t|trL|jj	rLt 
 " |jD ]}|t d qrW 5 Q R X qLn|j|krtd dS )a  
        Moves ``module`` depending on ``device_from_device_id`` and its current
        device. This includes moving ignored modules' parameters.

        - If ``device_from_device_id`` is not ``None``, then this moves
        ``module`` to the device.
        - If ``device_from_device_id`` is ``None``, then this does not move
        ``module`` but warns the user if it is on CPU.

        Precondition: ``_check_single_device_module()``.
        r   NaA  Module is put on CPU and will thus have flattening and sharding run on CPU, which is less efficient than on GPU. We recommend passing in `device_id` argument which will enable FSDP to put module on GPU device, module must also be on GPU device to work with `sync_module_states=True` flag which requires GPU communication.)rc   r   nextr  tor6  r   rG   r   rg   r  r   r  r   r   )r   r   r   r1  
cpu_devicerH  rA  r   r]   r]   r^   r  n  s&    




 
z/FullyShardedDataParallel._move_module_to_device)r   r   r1  r}   c                 C   sp   t | ||d}|dk	r.|jjdkr.|j}ntdtj }|dk	rl||krltd| j d| d| |S )aI  
        Determines and returns this FSDP instance's compute device. If the
        module is already on a non-CPU device, then the compute device is that
        non-CPU device. If the module is on CPU, then the compute device is the
        current device.

        Since this method should be called after materializing the module, any
        non-CPU device should not be meta device. For now, the compute device
        is always a CUDA GPU device with its explicit index.

        Precondition: ``_check_single_device_module()`` and
        ``_move_module_to_device()``.
        Nr   z4Inconsistent compute device and `device_id` on rank : z vs )	rd  r  r   r:  rc   r   rZ  r=  r   )r   r   r   r1  rH  r  r]   r]   r^   r    s    z,FullyShardedDataParallel._get_compute_device)r   r
  r}   c                 C   sz   |rt dd |D rtdg }| D ]$}t|dds*d|_||  q*|dd |D  t| j	|t
dd	 d
S )a  
        Synchronizes module states (i.e. parameters ``params`` and all
        not-yet-synced buffers) by broadcasting from rank 0 to all ranks.

        Precondition: ``sync_module_states == True`` and ``self.process_group``
        has been set.
        c                 s   s   | ]}|j t d kV  qdS )r   N)r   rc   rY  r]   r]   r^   r     s     z?FullyShardedDataParallel._sync_module_states.<locals>.<genexpr>zModule has CPU parameters, but sync_module_states=True is specified.This only works for GPU module, please specify `device_id` argument or move module to GPU before init._fsdp_syncedFTc                 s   s   | ]}|  V  qd S r   )detachrY  r]   r]   r^   r     s     r   )srcN)r_  r=  buffersgetattrrh  r   ri  extendr   r   _PARAM_BROADCAST_BUCKET_SIZE)r   r   r
  Zmodule_statesbufferr]   r]   r^   r    s     
   z,FullyShardedDataParallel._sync_module_statesc                 c   sF   |  }z$t|}||kr
t|s
|V  q
W n tk
r@   Y nX dS )z
        Returns an iterator over the original parameters in ``module``,
        ignoring the parameters in ``ignored_params`` and any ``FlatParameter``
        s (which may be present due to nested FSDP wrapping).
        N)rB  rd  r3   StopIteration)r   r   r   Z	param_genrH  r]   r]   r^   r    s    
z)FullyShardedDataParallel._get_orig_params)r   r}   c                 C   sF   |   D ]8\}}||krt|std| d|  d|j qdS )z
        Checks that all original parameters have been flattened and hence made
        invisible to ``named_parameters()``. This should be called as a sanity
        check after flattening the wrapped module's parameters.
        z Found an unflattened parameter: z;  N)named_parametersr3   r   r   r4  )r   r   
param_namerH  r]   r]   r^   r	    s
    z5FullyShardedDataParallel._check_orig_params_flattened)r   r}   c                 C   s   || j kr| j | dS )z5Registers the parameter handle to this FSDP instance.N)r   r   r   r   r]   r]   r^   r    s    
z/FullyShardedDataParallel._register_param_handler   c              	   C   s   |sdS | j r$| j }|r$|  d}tj| jd   |D ]}| }|pR|}q@W 5 Q R X |rz| jd 	| jd  tj| jd   |D ]}|
  |  qW 5 Q R X dS )aI  
        Unshards the handles in ``handles``. If the handles are in
        :meth:`summon_full_params` and are using mixed precision, then they are
        forced to full precision.

        Postcondition: Each handle's ``FlatParameter`` 's data is the padded
        unsharded flattened parameter on the compute device.
        NFpre_all_gather
all_gather)r   r  r   synchronizerc   r   streamr  Zpre_unshardwait_streamZunshardZpost_unshard)r   r   r   Zany_ran_pre_unshardr   Zran_pre_unshardr]   r]   r^   _unshard  s"    
z!FullyShardedDataParallel._unshard)r   free_unsharded_flat_paramsr}   c                 C   s   |sdS t t|t|kdt| dt|  t||D ]B\}}|| | jrv|rvtj }|  | j	
| |  q<t|}t|r| j|d dS )a  
        Reshards the handles in ``handles``. ``free_unsharded_flat_params``
        should have the same length as ``handles``, and each element should
        give whether the corresponding handle should free its padded unsharded
        flattened parameter.
        Nz0Expects both lists to have equal length but got z and )r5   r   zipZreshardr   rc   r   r   recordr  r   Zpost_reshardr   r_  r  pop)r   r   r{  r   free_unsharded_flat_paramr   r   r]   r]   r^   _reshard   s&    



z!FullyShardedDataParallel._reshardr   c                 C   s   t | jtst| jjS )zU
        Returns the wrapped module (like :class:`DistributedDataParallel`).
        )r   rT   r=   r?  r   r   r]   r]   r^   r   B  s    zFullyShardedDataParallel.module)namer}   c                    s4   zt  |W S  tk
r.   t| j| Y S X dS )z-Forward missing attributes to wrapped module.N)r   __getattr__AttributeErrorrl  rT   )r   r  r3  r]   r^   r  J  s    z$FullyShardedDataParallel.__getattr__)keyr}   c                 C   s   | j |S )z=Forward indexing calls in case the module is a nn.Sequential.)rT   __getitem__)r   r  r]   r]   r^   r  Q  s    z$FullyShardedDataParallel.__getitem__c                 C   s   |    | jd k	st| jS r   )
_lazy_initr  r?  r   r]   r]   r^   check_is_rootU  s    z&FullyShardedDataParallel.check_is_root)r   	root_onlyr}   c                    s    fdd|   D S )a  
        Returns all nested FSDP instances, possibly including ``module`` itself
        and only including FSDP root modules if ``root_only=True``.

        Args:
            module (torch.nn.Module): Root module, which may or may not be an
                ``FSDP`` module.
            root_only (bool): Whether to return only FSDP root modules.
                (Default: ``False``)

        Returns:
            List[FullyShardedDataParallel]: FSDP modules that are nested in
            the input ``module``.
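
        Example (a minimal usage sketch; assumes ``model`` wraps one or more
        FSDP instances)::

            >>> # xdoctest: +SKIP("undefined variables")
            >>> all_fsdp = FullyShardedDataParallel.fsdp_modules(model)
            >>> root_fsdp = FullyShardedDataParallel.fsdp_modules(model, root_only=True)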
        c                    s&   g | ]}t |tr r| r|qS r]   )r   rG   r  )r   rA  r  r]   r^   
<listcomp>m  s
   
 z9FullyShardedDataParallel.fsdp_modules.<locals>.<listcomp>)r6  )r   r  r]   r  r^   r   Z  s    
z%FullyShardedDataParallel.fsdp_modules)fnr}   c              	      sb   | j dk}| tj | jddd t |}W 5 Q R X |r^| j r^| | D ]}|  qP|S )a:  Applies ``fn`` recursively to every submodule (as returned by ``.children()``)
        as well as self. Typical use includes initializing the parameters of a model
        (see also :ref:`nn-init-doc`).

        Compared to ``torch.nn.Module.apply``, this version additionally gathers
        the full parameters before applying ``fn``. It should not be called from
        within another ``summon_full_params`` context.

        Args:
            fn (:class:`Module` -> None): function to be applied to each submodule

        Returns:
            Module: self
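
        Example (a minimal sketch; ``fsdp_model`` and the ``init_weights``
        callable are assumptions for illustration)::

            >>> # xdoctest: +SKIP("undefined variables")
            >>> @torch.no_grad()
            >>> def init_weights(m):
            >>>     if isinstance(m, torch.nn.Linear):
            >>>         m.weight.fill_(1.0)
            >>> fsdp_model.apply(init_weights)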
        NFTrM  	writeback)	r  _assert_staterR   rj   _summon_full_paramsr   applyr   _reset_lazy_init)r   r  Zuninitializedretr   r3  r]   r^   r  s  s    


zFullyShardedDataParallel.applyc                 C   s   | j jdk	S )z`
        Whether user explicitly enabled mixed precision for
        parameters or not.
        N)r   r_   r   r]   r]   r^   #_mixed_precision_enabled_for_params  s    z<FullyShardedDataParallel._mixed_precision_enabled_for_paramsc                 C   s   | j jdk	S )z]
        Whether user explicitly enabled mixed precision for
        buffers or not.
        N)r   ra   r   r]   r]   r^   $_mixed_precision_enabled_for_buffers  s    z=FullyShardedDataParallel._mixed_precision_enabled_for_buffersc                 C   s   | j jdk	S )zh
        Whether user explicitly enabled mixed precision for
        gradient reduction or not.
        N)r   r`   r   r]   r]   r^   #_mixed_precision_enabled_for_reduce  s    z<FullyShardedDataParallel._mixed_precision_enabled_for_reducec                 C   s   | j d k	o| j jS r   )r   rb   r   r]   r]   r^   )_mixed_precision_keep_low_precision_grads  s    
zBFullyShardedDataParallel._mixed_precision_keep_low_precision_gradsc                 C   s   | j dk	o| j tkS )zC
        Whether a low precision hook is registered or not.
        N)r  r   r   r]   r]   r^   _low_precision_hook_enabled  s    
z4FullyShardedDataParallel._low_precision_hook_enabled)rd   rR  kwargsr}   c              
      sL   t jt jd fdd}t  " t||t||fW  5 Q R  S Q R X dS )z
        Casts floating point tensors in ``args`` and ``kwargs`` to the
        precision given by ``dtype``, while respecting the existing
        ``requires_grad`` on the tensors.
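
        Example (a standalone sketch of the per-tensor cast described above,
        not this method's exact implementation; ``dtype``, ``args``, and
        ``kwargs`` are assumed to be in scope)::

            >>> # xdoctest: +SKIP("undefined variables")
            >>> def cast_fn(x):
            >>>     if not torch.is_floating_point(x):
            >>>         return x
            >>>     y = x.to(dtype)
            >>>     if x.is_leaf:
            >>>         y.requires_grad = x.requires_grad
            >>>     return y
            >>> with torch.no_grad():
            >>>     new_args = _apply_to_tensors(cast_fn, args)
            >>>     new_kwargs = _apply_to_tensors(cast_fn, kwargs)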
        )xr}   c                    s*   t | s| S |  }| jr&| j|_|S r   )rc   is_floating_pointre  Zis_leafrequires_grad)r  yrd   r]   r^   cast_fn  s    

zBFullyShardedDataParallel._cast_fp_inputs_to_dtype.<locals>.cast_fnN)rc   Tensorr  r0   )r   rd   rR  r  r  r]   r  r^   _cast_fp_inputs_to_dtype  s
    

z1FullyShardedDataParallel._cast_fp_inputs_to_dtypeT)r   rd   memorM  r}   c                 C   s   |dkrt  }|  D ]}|| k	rDt|trD|rD|j||||d q||kr|| |jddD ]|\}}|dkrtqb|j|p| jd}|| j	kr|j
| j	|< t|r|r|j|| d}n|  r|| jj}t||| qbqdS )a  Move all buffers to the given *device* and *dtype*.
        If *device* is not given, then it will default to
        ``self.compute_device``; otherwise, the buffer will be moved to ``device``.
        In the case of nested FSDP instances, we will respect the child instance's
        ``compute_device`` configuration.
        If *dtype* is given, it must be a mapping of buffer name to buffer dtype,
            and this argument is currently only given to restore the original
            buffer dtypes during checkpointing. If *dtype* is not given and we are
            in mixed precision training, the buffer will be cast to buffer_dtype;
            otherwise the buffer will not be cast.
        Args:
            device (torch.device, Optional):
                device to cast buffers to (defaults to compute_device)
            dtype (Dict[str, torch.dtype], Optional):
                Mapping of buffer names to the dtypes to cast them to.
            memo (Set, Optional):
                set of modules that have already been processed
            recurse (bool, Optional):
                Whether to call _cast_buffers recursively on nested FSDP
                instances (default is True).
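
        Example (an illustrative sketch only; ``fsdp_model`` and the buffer
        name are assumptions)::

            >>> # xdoctest: +SKIP("undefined variables")
            >>> # Cast one named buffer back to float32 without recursing.
            >>> fsdp_model._cast_buffers(
            >>>     dtype={"bn.running_mean": torch.float32}, recurse=False
            >>> )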
        N)r   rd   r  rM  FrL  rX  r  )r8  r6  r   rG   _cast_buffersrO  rN  re  r  r   rd   rc   r  r  r   ra   setattr)r   r   rd   r  rM  r   r  bufr]   r]   r^   r    s&    


z&FullyShardedDataParallel._cast_buffersc                 C   s$   d| _ | jD ]}t|dr|`qdS )zT
        Reset instance so :func:`_lazy_init` will run on the next forward.
        N_local_shard)r  r
  r>  r  )r   rD  r]   r]   r^   r    s    

z)FullyShardedDataParallel._reset_lazy_initc                 C   s"  | j dk	rdS tj s tdd| _ | tj |   | j	dd | j
D ]}| | qL| j| | j d}| | D ]}|| k	rz|j dks|j rtdd|_ | j|_| j|_|j| jkrd}| j|_| j|_| j|_| j|_|j
D ]}|| qqz|rtd| j d| j d	 dS )
a  
        Performs initialization lazily, typically right before the first
        forward pass. The laziness is needed to ensure that the parameter
        device/dtype and the FSDP hierarchy have finalized.

        This method's actual logic only runs on the root FSDP instance, which
        performs initialization for all non-root FSDP instances to avoid
        partial initialization.
        Nz(FSDP does not support CPU only executionTrL  FzcNon-root FSDP instance's `_is_root` should not have been set yet or should have been set to `False`zLFound inconsistent `limit_all_gathers` values across FSDP instances on rank z!. Using the root FSDP's value of z for all instances.)r  rc   r   is_availabler   r  rR   rj   _init_streamsr  r   _init_param_attributesr  r   r   r   r?  r  r   r  r  r  r   r   r   )r   r   Zinconsistent_limit_all_gathersr   r]   r]   r^   r    s@    




z#FullyShardedDataParallel._lazy_initc                 C   sh  |j }t|dr@| jjr<|jjtdks<td|jj dS | jjr`|jtdks`td|j|_| jjr|jjtdkst|j	 |_tj
|tdd	 |_|  rtj
|j| j| jjd|_t|j |jr^|  s|jn| jj}tj| | j | j|d|_|j |_t|j |  r^tj| | j | j|jd|_t|j d|_dS )	a  
        We manage several attributes on each Parameter instance.
        A few attributes are set here:
            ``_local_shard``: a single shard of the parameter. This is needed to
                recover the shard after rebuilding full parameter in forward
                and backward.
            ``_full_param_padded``: the full weight (padded to be evenly
                divisible by ``world_size``), used for computation in the
                forward and backward pass. It is initialized with the
                appropriate size and then has its storage freed. This will be
                resized in place and only materialized (via all-gather) as needed.
        Another attribute is set by :func:`_register_post_backward_hooks`:
            ``_post_backward_hook_state``: it holds the parameter's AccumulateGrad object
                and the registered post hook handle.
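
        Example (a sketch of the free/re-materialize storage pattern described
        for ``_full_param_padded``, not this method's exact code; the names are
        assumptions)::

            >>> # xdoctest: +SKIP("undefined variables")
            >>> full = torch.zeros(padded_numel, device=compute_device)
            >>> full.storage().resize_(0)             # free until needed
            >>> full.storage().resize_(padded_numel)  # re-materialize before the all-gather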
        r  r   z2Expected p._local_shard to be on CPU, but it's on NzExpected param to be on CPU when cpu_offloading is enabled. If CPU offloading is enabled correctly, you may be accidentally moving the model to CUDA after FSDP initialization.rX  )r   rd   F)r   r>  r   rg   r  r   rc   r?  dataZ
pin_memory
zeros_like	_cpu_gradr  r  r   r_   Z	_mp_shardr2   uses_sharded_strategyrd   r   numelr   Z_full_param_paddedr   Z_padded_unsharded_sizeZ_full_prec_full_param_padded_post_backward_called)r   r   rD  Zfull_param_dtyper]   r]   r^   r  M  s`    

 



z/FullyShardedDataParallel._init_param_attributesc                 C   sL   | j s
ttj sttj | jd< tj | jd< tj | jd< dS )zInitializes CUDA streams for overlapping data transfer and
        computation. This should only be called on the root FSDP instance.rv  post_backwardru  N)r  r?  rc   r   r  ZStreamr  r   r]   r]   r^   r    s
    
z&FullyShardedDataParallel._init_streamsc                 C   s8   | j s
dS tj }| jd | | jd | dS )z
        The root :class:`FullyShardedDataParallel` instance needs to
        synchronize with the default stream to ensure that the previous
        optimizer step is done.
        Nrv  ru  )r  rc   r   current_streamr  ry  )r   r  r]   r]   r^   _wait_for_previous_optim_step  s
    
z6FullyShardedDataParallel._wait_for_previous_optim_stepr   c                 C   s4   |sdS |  |}|D ]}| | d| j|< qdS )z
        Prefetches the next handles if needed (without synchronization). An
        empty handles key cannot prefetch.
        NT)_get_handles_to_prefetchrz  r  )r   r   Zhandles_to_prefetchr   r]   r]   r^   _prefetch_handles  s    

z*FullyShardedDataParallel._prefetch_handlesc                    s     |}tjtjtjf}t||kd| d|   j}g }|tjkrT jtjksj|tjkr jtjkr fdd|	|D }n(|tjkr j
r fdd||D }|S )ap  
        Returns a :class:`list` of the handles keys to prefetch for the next
        module(s), where ``current_handles_key`` represents the current module.

        "Prefetching" refers to running the unshard logic early (without
        synchronization), and the "next" modules depend on the recorded
        execution order and the current training state.
        z!Prefetching is only supported in z but currently in c                    s,   g | ]$} j |d r j|d s|qS F)r  r   r  r   Ztarget_handles_keyr   r]   r^   r    s   zEFullyShardedDataParallel._get_handles_to_prefetch.<locals>.<listcomp>c                    s,   g | ]$} j |d r j|d s|qS r  )r  r   r  r  r   r]   r^   r    s   )_get_training_stater:   rh   ri   rk   r5   r  r   rK   r   r   r   )r   r   r   Zvalid_training_statesZeodr   r]   r   r^   r    s<    





z1FullyShardedDataParallel._get_handles_to_prefetchr   c                 C   sH   t t|dkd tdd |D }t t|dkd|  tt|S )z=Returns the training state of the handles in ``handles_key``.r   zExpects a non-empty handles keyc                 s   s   | ]}|j V  qd S r   )_training_stater   r   r]   r]   r^   r     s     z?FullyShardedDataParallel._get_training_state.<locals>.<genexpr>r"   z'Expects uniform training state but got )r5   r   r8  rd  iter)r   r   Ztraining_statesr]   r]   r^   r    s    
z,FullyShardedDataParallel._get_training_state)r   state_dict_typestate_dict_configr}   c              	   c   s   d}d}|dkrt |  }t| D ]}|dkr6|j}|dkrD|j}||jkrVtdt|t|jkrptdt | }|t|krtd| dt| ||_||_q$z
dV  W 5 |dk	st|dk	stt| D ]}||_||_qX dS )a  
        A context manager to set the ``state_dict_type`` of all the descendant
        FSDP modules of the target module. The target module does not have to
        be a FSDP module. If the target module is a FSDP module, its
        ``state_dict_type`` will also be changed.

        .. note:: This API should be called for only the top-level (root)
            module.

        .. note:: This API enables users to transparently use the conventional
            ``state_dict`` API to take model checkpoints in cases where the
            root FSDP module is wrapped by another ``nn.Module``. For example,
            the following will ensure ``state_dict``  is called on all non-FSDP
            instances, while dispatching into `local_state_dict` implementation
            for FSDP:

        Example::

            >>> # xdoctest: +SKIP("undefined variables")
            >>> model = DDP(FSDP(...))
            >>> with FSDP.state_dict_type(model, StateDictType.LOCAL_STATE_DICT):
            >>>     checkpoint = model.state_dict()

        Args:
            module (torch.nn.Module): Root module.
            state_dict_type (StateDictType): the desired ``state_dict_type`` to set.
        Nz0All FSDP module should the same state_dict_type.z@All FSDP modules should have the same type of state_dict_config.z#Expected state_dict_config of type 	 but got )_state_dict_type_to_configrG   r   r  r  r   r:  r?  )r   r  r  Zprev_state_dict_typeZprev_state_dict_configrA  Zexpected_state_dict_config_typer]   r]   r^   r  %  s:    "


z(FullyShardedDataParallel.state_dict_type)rW  r}   c                 C   sF   | t dd}| t d}|r.| d}| tj dd}|S )NrU    )replacer<   checkpoint_wrapperr   )r   rW  r]   r]   r^   _convert_to_wrapped_module_nameh  s    

 z8FullyShardedDataParallel._convert_to_wrapped_module_namec                 c   s<   | j j D ]*\}}| |}| | }|||fV  qd S r   )rT   r   Zparameter_module_namesr  )r   rs  rW  fqnr]   r]   r^   _param_fqnst  s
    


z$FullyShardedDataParallel._param_fqns)
state_dictrJ  r}   c                 C   s  t ||t d | | tjg |r0| jjs4|S t| jdrD|S | jj	}t
d}| jD ]\}}}| | }|}t|}	||	r|t|	d }||kstd| d|  d| d| d	| d
| j d|| jkr\t|| dds\z"||   ||< d|| _W q\ tk
rT }
 z$td| d| dt|
  W 5 d}
~
X Y q\X q\|r| jD ]V}|tj dd}| | }||krqd|| j|krd||  |||< qd|S )a)  
        Hook that runs after model.state_dict() is called, before returning the
        result to the user. For FSDP, we may have to clone the tensors in the
        state_dict, since params go back to their sharded version after
        _summon_full_params ends, and we also remove the "_fsdp_wrapped_module"
        prefix.
        rU   r   r   NzFSDP assumes z2 is in the state_dict but the state_dict only has z	. prefix=z, module_name=z param_name=z rank=_has_been_clonedFTz#Failed to clone() tensor with name z. This may mean that this state_dict entry could point to invalid memory regions after returning from state_dict() call if this parameter is managed by FSDP. Please check clone implementation of z	. Error: r  )!r   FSDP_WRAPPED_MODULEr  rR   rl   rT   r  r>  r  rp   rc   r   r  rS   
startswithr   r?  keysr   r   rl  cloneri  r  rb  r   r   r   r   r  r  r   re  )r   r  rJ  rp   rf  r  rs  rW  Z	clean_keyZclean_prefixrc  r]   r]   r^   r!  }  sN    


.


 
z3FullyShardedDataParallel._full_post_state_dict_hookc                 C   s   t || t d| | jjs"|S t| jtd}|dk	s<t|j }| | j	 }| |j
 }|dkr|j
dkr|dd|}t||g| j	g}t||| jd|| t < |S )z
        This hook creates a ShardedTensor from the local flat_param and replaces
        the state_dict[f"{prefix}{FLAT_PARAM}"] with the ShardedTensor. No copy
        happens; the underlying storage is the same.
        rU   Nr   r   )r   r  rT   r  rl  r;   r?  Z_unpadded_unsharded_sizer  r   _shard_numel_paddednarrowr   Zfrom_tensor_and_offsetsr   r   )r   r  rJ  r   Z
full_numelZshard_offsetZvalid_data_sizelocal_shardsr]   r]   r^   r"    s$    

  z4FullyShardedDataParallel._local_post_state_dict_hookc              	   C   s   t || t d| | jjs"|S | jtjks6td| jdddX | j	D ]J\}}}t
t|d| j}t|| j| jtj | jd|| | < qLW 5 Q R X || t  |S )z
        The hook replaces the unflattened, unsharded parameter in the state_dict
        with an unflattened, sharded parameter (a ShardedTensor).
        rU   z]Inside _sharded_post_load_state_dict_hook, the training_state must not be SUMMON_FULL_PARAMS.Fr  )r   r   r   Znum_devices_per_nodeZpg)r   r  rT   r  r   rR   rl   r?  r  r  rU  reducerl  splitr   r-   r   r   rc   r   Zdevice_countr   r~  r;   )r   r  rJ  r  r   rH  r]   r]   r^   r#    s$    
z6FullyShardedDataParallel._sharded_post_state_dict_hook)r   r  rJ  rR  r}   c                 G   s:   t t| }|j|j ||}|jr6| r6|jdd |S )z
        _post_state_dict_hook() is called after the state_dict() of this
        FSDP module is executed. ``self._state_dict_type`` is used to decide
        what postprocessing will be done.
        TrL  )r   rG   r$  r  r  r  r  )r   r  rJ  rR  r   Zprocessed_state_dictr]   r]   r^   r   	  s    
z.FullyShardedDataParallel._post_state_dict_hookc              	      s   t j rt j  |   | jtjkr| jdk	r8| jnt	 }|j
}|j}| jtjkrh| jdd||dnt }|2 | jr|  r| j| jdd t j||}W 5 Q R X |r| jdkr|S i S nX| jtjks| jtjkr
| jjdk	r| jjjstdt j||S t d| j ddS )	aC  
        This is the entry point of all three FSDP ``state_dict`` APIs: full,
        local, and sharded. For the full state dict
        (``StateDictType.FULL_STATE_DICT``), FSDP attempts to unshard the model
        on all ranks, which may result in an OOM error if the full model cannot
        fit on a single GPU. In that case, users may pass in a
        :class:`FullStateDictConfig` to only save the checkpoint on rank 0 and/
        or to offload it to CPU memory layer by layer, enabling much larger
        checkpoints. If the full model cannot fit in CPU memory, then users may
        instead take a local state dict (``StateDictType.LOCAL_STATE_DICT``)
        that only saves the local shard of the model. The sharded state dict
        (``StateDictType.SHARDED_STATE_DICT``) saves the model parameters as
        ``ShardedTensor`` s. The ``state_dict`` type can be configured using
        the :meth:`state_dict_type` context manager.

        Example::

            >>> # xdoctest: +SKIP("undefined variables")
            >>> import torch
            >>> from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
            >>> from torch.distributed.fsdp import StateDictType
            >>> torch.cuda.set_device(device_id)
            >>> my_module = nn.Linear(...)
            >>> sharded_module = FSDP(my_module)
            >>> full_state_dict_config = FullStateDictConfig(offload_to_cpu=True, rank0_only=True)
            >>> with FSDP.state_dict_type(sharded_module, StateDictType.FULL_STATE_DICT, full_state_dict_config):
            >>>     full_dict = sharded_module.state_dict()
            >>> full_dict.keys()
            >>> odict_keys(['weight', 'bias'])
            >>> # using local state dict
            >>> with FSDP.state_dict_type(sharded_module, StateDictType.LOCAL_STATE_DICT):
            >>>     local_dict = sharded_module.state_dict()
            >>> local_dict.keys()
            >>> odict_keys(['flat_param', 'inner.flat_param'])

        .. warning:: This needs to be called on all ranks, since synchronization
            primitives may be used.
        NF)rM  r  rp   rq   )rd   rM  r   z_sharded_state_dict/local_state_dict can only be called when parameters are flatten and sharded.zUnknown StateDictType rU   )!rc   r   r  rw  r  r  rL   rm   r  rN   rq   rp   r   rR   rl   r  
contextlibsuppressr  r  r  r   r   r  r   rn   ro   rT   r   r   r  r   r=  )r   rR  r  Zfull_state_dict_configrq   rp   Z
summon_ctxr  r3  r]   r^   r  '	  sX    )


    


z#FullyShardedDataParallel.state_dict)rR  r  r}   c              
   O   s2   |  | tj | j||W  5 Q R  S Q R X dS )z
        Returns the local state of the module. Parameters are flattened and
        sharded, so the resulting state_dict can only be loaded after the module
        has been wrapped with FSDP.
        N)r  rL   rn   r  r   rR  r  r]   r]   r^   _local_state_dict	  s    z*FullyShardedDataParallel._local_state_dictc                 O   s<   |  tjg t| dd d k	s"t| jd d d  d | _d S )N_full_param_ctx)r  rR   rl   rl  r?  r  __exit__r  r]   r]   r^   r+  	  s    z8FullyShardedDataParallel._full_post_load_state_dict_hookc              
   O   s6   |  tj  | j| f||W  5 Q R  S Q R X dS )a  
        Returns the sharded states of the module. Parameters are unflattened and
        sharded, so the resulting state_dict can be used with any parallelism
        (e.g., DDP, model parallelism, and single trainer) after a valid
        resharding.
        N)set_state_dict_typerL   ro   r  r  r]   r]   r^   _sharded_state_dict	  s    z,FullyShardedDataParallel._sharded_state_dictc                 C   sH   t | dd d kst| jddd| _| j  t|||t d  d S )Nr  FTr  rU   )rl  r?  r  r  	__enter__r   r  )r   r  rJ  r]   r]   r^   r&  	  s     
z7FullyShardedDataParallel._full_pre_load_state_dict_hookc                 O   s   d S r   r]   r  r]   r]   r^   r,  	  s    z9FullyShardedDataParallel._local_post_load_state_dict_hookc                 C   s  t ||| t d | t dt }||krNt| jtddksJtddS || }t|tshtd| }t	|stdt
tj|d j}| jj}|dk	st|jd| fkr| | k std|  d|  dt|d|jg}|||< dS )	z
        This hook finds the local flat_param for this FSDP module from the
        state_dict. The flat_param should be a ShardedTensor. This hook converts
        the ShardedTensor to a tensor. No copy happens unless padding is required.
        rU   NzTNo flat parameter in state_dict but self._fsdp_wrapped_module.flat_param is not Nonez4Tensors in local_state_dict should be ShardedTensor.z9load_local_state_dict assume one shard per ShardedTensor.r   zLocal shard size = z% and the tensor in the state_dict is )r   r  r;   rl  rT   r?  r   r   r  r   r   rc   r  r   r   r  r  Fpad)r   r  rJ  r  Zload_tensorshardsr   r]   r]   r^   r'  	  s2    
 z8FullyShardedDataParallel._local_pre_load_state_dict_hookc                 O   s   d S r   r]   r  r]   r]   r^   r-  	  s    z;FullyShardedDataParallel._sharded_post_load_state_dict_hookc                 C   s2  t |||t d  | jjs"dS | jjjs4tdg }| jjjjD ]L\}}}| 	|}| t d| | }|
|}t|\}}	t|	dk stdt|	 d| j |  }
| d }t|| j |
 | }|	r0ttj|	d j }|js| }||  }|dkrDt|d|g}ntj||jd }tj|| j |jd }t j!||| j"d	 |#dd|
$| }|%| qD| jj}t&j'|d
d}t&(|| j| j\}}|)|j* | | kstd|  d|  d|j+|ks td| d|j+ d||| d< dS )z
        The hook combines the unflattened, sharded parameters (ShardedTensor) into
        a new FlatParameter and shards the new FlatParameter to the local chunk.
        rU   NzSload_sharded_state_dict can only be called when parameters are flatten and sharded.r   z&Expects 0 or 1 shard per rank but got z shards on rank r   r  r   F)r  z+The loaded local chunk has different numel(z) from the local chunk z-The loaded local chunk has different padding(z_fsdp_wrapped_module.flat_param),r   r  rT   r  r   r  r   r   Z_param_infosr  r~  r.   r   r?  r   r   r  mathceilr   r   rc   r  r   flattenZis_cudar   r  r  r   rd   emptyr   r   r   r  Zreshaper   r7   Zflatten_paramsZ
_get_shardre  r   r  )r   r  rJ  Znonsharded_tensorsrs  r   rW  r  rH  r  Zparam_numelZ
dim_0_size
chunk_sizeZlocal_tensorZnum_paddingr   r   Zloaded_flat_paramZ
num_to_padr]   r]   r^   r(  	  sd    	



 
  z:FullyShardedDataParallel._sharded_pre_load_state_dict_hookc                 G   s4   t t| }tj rtj  |j|j || dS )z
        ``_pre_load_state_dict_hook`` is called before ``self._load_from_state_dict()``
        is called. ``self._state_dict_type`` is used to decide what preprocessing
        will be done.
        N)r   rG   rc   r   r  rw  r)  r  )r   r  rJ  rR  r   r]   r]   r^   r%  (
  s    


z2FullyShardedDataParallel._pre_load_state_dict_hook)r   rR  r}   c                 G   s   t t| }|j|j   d S r   )r   rG   r.  r  )r   rR  r   r]   r]   r^   r*  ;
  s    
z3FullyShardedDataParallel._post_load_state_dict_hook)r  r}   c                    s   t  j|f| S )a  
        The entry point of all three FSDP ``load_state_dict`` APIs. By default,
        calling ``load_state_dict`` on an FSDP module will result in FSDP
        attempting to load a "full" state_dict, i.e. a state_dict consisting of
        full, unsharded, unflattened original module parameters. This requires
        FSDP to load the full parameter context on each rank, which could result
        in GPU OOM. As a result, the :func:`state_dict_type` API is available to
        configure between ``load_state_dict`` implementations. Users can thus use
        the ``with self.state_dict_type(self, StateDictType.LOCAL_STATE_DICT)``
        context manager to load a local state dict checkpoint that will restore only
        local shards of the module. Currently, the only supported
        implementations are ``StateDictType.LOCAL_STATE_DICT`` and
        ``StateDictType.FULL_STATE_DICT`` (default). Please see :func:`state_dict`
        for documentation around creating an FSDP checkpoint.

        Example::

            >>> # xdoctest: +SKIP("undefined variables")
            >>> import torch
            >>> from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
            >>> from torch.distributed.fsdp import StateDictType
            >>> torch.cuda.set_device(device_id)
            >>> my_module = nn.Linear(...)
            >>> sharded_module = FSDP(my_module)
            >>> checkpoint = torch.load(PATH)
            >>> full_state_dict = checkpoint['full_state_dict']
            >>> with FSDP.state_dict_type(sharded_module, StateDictType.FULL_STATE_DICT):
            >>>     sharded_module.load_state_dict(full_state_dict)
            >>> full_dict.keys()
            >>> odict_keys(['weight', 'bias'])
            >>> # using local state dict
            >>> local_state_dict = checkpoint['local_state_dict']
            >>> with FSDP.state_dict_type(sharded_module, StateDictType.LOCAL_STATE_DICT):
            >>>     sharded_module.load_state_dict(local_state_dict)
            >>> local_dict.keys()
            >>> odict_keys(['flat_param', 'inner.flat_param'])

        .. warning:: This needs to be called on all ranks, since synchronization
            primitives may be used.
        )r   load_state_dict)r   r  rR  r  r3  r]   r^   r  C
  s    .z(FullyShardedDataParallel.load_state_dictc              
   G   s6   |  | tj | j|f| W  5 Q R  S Q R X dS )zI
        Load states from a flattened, sharded state dictionary.
        N)r  rL   rn   r  )r   r  rR  r]   r]   r^   _load_local_state_dicts
  s    z/FullyShardedDataParallel._load_local_state_dictzOrderedDict[str, torch.Tensor])r  strictr}   c              
   C   s0   |  tj | ||W  5 Q R  S Q R X dS )zK
        Load states from an unflattened, sharded state dictionary.
        N)r  rL   ro   r  )r   r  r  r]   r]   r^   _load_sharded_state_dict~
  s    z1FullyShardedDataParallel._load_sharded_state_dictc           	   
      s   t jjd     j||\}}d}tj j j	d} fdd j	D }t j
 j	|}  j	|||  j	D ]*}t|jj jkd j d|jj  q| j||}  j	||||W  5 Q R  S Q R X dS )z
        Runs the forward pass for the wrapped module, inserting FSDP-specific
        pre- and post-forward sharding logic.
         FullyShardedDataParallel.forwardNr   c                    s"   g | ]} j  o|jjtjkqS r]   )r  _configr   r9   rZ   r  r   r]   r^   r  
  s   z4FullyShardedDataParallel.forward.<locals>.<listcomp>z5Expected `FlatParameter` to be on the compute device r  )rc   autogradprofilerrecord_functionr  _fsdp_root_pre_forwardrU  rV  _pre_forward_unshardr   r  _pre_forwardr5   r   r   r  rT   _post_forward)	r   rR  r  unused
unshard_fnr{  
reshard_fnr   outputr]   r   r^   forward
  s*    

r  )r   r  r   inputc                 C   sF   t j| _| j|| j |D ]}tj|_q|dk	r8|  | | dS )a  
        Runs the pre-forward logic. This includes an opportunity to unshard
        currently sharded parameters such as those for the current forward and
        registering post-backward hooks for these current parameters.

        Args:
            handles (List[FlatParamHandle]): Handles giving the parameters
                used in the current forward.
            unshard_fn (Optional[Callable]): A callable to unshard any
                currently sharded parameters or ``None`` to not do any
                unsharding.
            module (nn.Module): Unused; expected by the hook signature.
            input (Any): Unused; expected by the hook signature.
        N)	rR   rk   r   r  r   Ztrainingr:   r  _register_post_backward_hooks)r   r   r  r   r  r   r]   r]   r^   r  
  s    
z%FullyShardedDataParallel._pre_forwardc                 C   sD   |r@|  | t|}d| j|< tj | jd  | | dS )z'Unshards parameters in the pre-forward.Frv  N)	rz  r   r  rc   r   r  ry  r  r  )r   r   r   r]   r]   r^   r  
  s    

z-FullyShardedDataParallel._pre_forward_unshard)r   r  r   r  r  r}   c                 C   sD   | j | |dk	r|  | ||}tj| _|D ]}tj|_q2|S )a  
        Runs the post-forward logic. This includes an opportunity to reshard
        currently unsharded parameters such as those used in the current
        forward and registering pre-backward hooks on the forward outputs.

        Args:
            handles (List[FlatParamHandle]): Handles giving the parameters
                used in the current forward.
            reshard_fn (Optional[Callable]): A callable to reshard any
                currently unsharded parameters (e.g. from the current forward)
                or ``None`` to not do any resharding.
            module (nn.Module): Unused; expected by the hook signature.
            input (Any): Unused; expected by the hook signature.
            output (Any): Forward pass output; pre-backward hooks are
                registered on the tensors that require gradients in this
                output.

        Postcondition: Each ``FlatParameter`` 's data points to the sharded
        flattened parameter.
        N)r  r   _register_pre_backward_hooksrR   rj   r   r:   r  )r   r   r  r   r  r  r   r]   r]   r^   r  
  s    
z&FullyShardedDataParallel._post_forwardc                 O   sT   t ||| jjd\}}|d }|d }|  rL| jj}| j|f||\}}||fS )ziMoves the forward inputs to the compute device and casts them to the
        appropriate dtype if needed.Fr   )r    r  r   r  r   r_   r  )r   rR  r  Zinput_dtyper]   r]   r^   _cast_forward_inputs
  s    z-FullyShardedDataParallel._cast_forward_inputsc                 O   sl   t | jdk	d | js||fS | jrL| | D ]}t|j}|r.d| j|< q.|   | j||\}}||fS )am  
        Runs pre-forward logic specific to the root FSDP instance, which should
        run before any individual module's pre-forward. This includes
        synchronizing with the previous iteration and casting the forward
        inputs appropriately. If this is called on a non-root FSDP instance,
        then the forward inputs are returned directly.
        Nz$Expects a root FSDP to have been setT)	r5   r  r   r   r   r   r  r  r  )r   rR  r  r   r   r]   r]   r^   r    s    
z/FullyShardedDataParallel._fsdp_root_pre_forward)rM  r  rq   rp   r}   c              
   c   sN   t j| dd}t .}|D ]} || j||||d qdV  W 5 Q R X dS )a   A context manager to expose full params for FSDP instances.
        Can be useful *after* forward/backward for a model to get
        the params for additional processing or checking. It can take a non-FSDP
        module and will summon full params for all contained FSDP modules as
        well as their children, depending on the ``recurse`` argument.

        .. note:: This can be used on inner FSDPs.
        .. note:: This can *not* be used within a forward or backward pass. Nor
            can forward and backward be started from within this context.
        .. note:: Parameters will revert to their local shards after the context
            manager exits; storage behavior is the same as in the forward pass.
        .. note:: The full parameters can be modified, but only the portion
            corresponding to the local param shard will persist after the
            context manager exits (unless ``writeback=False``, in which case
            changes will be discarded). In the case where FSDP does not shard
            the parameters, currently only when ``world_size == 1``, or ``NO_SHARD``
            config, the modification is persisted regardless of ``writeback``.
        .. note:: This method works on modules which are not FSDP themselves but
            may contain multiple independent FSDP units. In that case, the given
            arguments will apply to all contained FSDP units.

        .. warning:: Note that ``rank0_only=True`` in conjunction with
            ``writeback=True`` is not currently supported and will raise an
            error. This is because model parameter shapes would be different
            across ranks within the context, and writing to them can lead to
            inconsistency across ranks when the context is exited.

        .. warning:: Note that ``offload_to_cpu`` and ``rank0_only=False`` will
            result in full parameters being redundantly copied to CPU memory for
            GPUs that reside on the same machine, which may incur the risk of
            CPU OOM. It is recommended to use ``offload_to_cpu`` with
            ``rank0_only=True``.

        Args:
            recurse (bool, Optional): recursively summon all params for nested
                FSDP instances (default: True).
            writeback (bool, Optional): if ``False``, modifications to params are
                discarded after the context manager exits;
                disabling this can be slightly more efficient (default: True)
            rank0_only (bool, Optional): if ``True``, full parameters are
                materialized on only global rank 0. This means that within the
                context, only rank 0 will have full parameters and the other
                ranks will have sharded parameters. Note that setting
                ``rank0_only=True`` with ``writeback=True`` is not supported,
                as model parameter shapes will be different across ranks
                within the context, and writing to them can lead to
                inconsistency across ranks when the context is exited.
            offload_to_cpu (bool, Optional): If ``True``, full parameters are
                offloaded to CPU. Note that this offloading currently only
                occurs if the parameter is sharded (which is only not the case
                for world_size = 1 or ``NO_SHARD`` config). It is recommended
                to use ``offload_to_cpu`` with ``rank0_only=True`` to avoid
                redundant copies of model parameters being offloaded to the same CPU memory.
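
        Example (a minimal usage sketch; ``model`` is assumed to be, or to
        contain, an FSDP instance)::

            >>> # xdoctest: +SKIP("undefined variables")
            >>> with FullyShardedDataParallel.summon_full_params(model, writeback=False):
            >>>     full_weight = model.weight.clone()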
        Tr  rM  r  rq   rp   N)rG   r   r  	ExitStackenter_contextr  )r   rM  r  rq   rp   Zroot_fsdp_modulesstackr]   r]   r^   summon_full_params"  s     A 
	z+FullyShardedDataParallel.summon_full_paramsr  c           	      c   s  |r|rt d|r"|s"td |rlt 4}| | D ]}||jd|||d q:d V  W 5 Q R X d S tj	
  |   | tjg | jD ]}|jtjkstqtj| _| jD ]}tj|_qdd | jD }| | j tj	 | jd  |rD| jdkrD| | j| z
d V  W 5 tj| _| jD ]}tj|_q0X nt }| jD ]"}|rT|jrT||  qT|| j  z
d V  W 5 |  |r|  | j | | j| tj| _| jD ]}tj|_qX W 5 Q R X d S )	Nzwriteback=True and rank0_only=True is not supported, as model parameter shapes will be different across ranks, and writing to them can lead to inconsistencies across ranks when the context is exited.zoffload_to_cpu and rank0_only=False will result in full parameters being redundantly copied to CPU memory for GPUs that reside on the same machine, which may incur the risk of CPU OOM. It is recommended to use ``offload_to_cpu`` with rank0_only=True.Fr  c                 S   s   g | ]}|  qS r]   )Zneeds_unshardr  r]   r]   r^   r    s     z@FullyShardedDataParallel._summon_full_params.<locals>.<listcomp>rv  r   )!r=  r   r   r  r  r   r  r  rc   r   rw  r  r  rR   rj   r   r  r:   r?  rl   r   rz  r  ry  r  r   r  r  Zto_cpurT   Zunflatten_as_paramsclose_write_back_to_local_shard)	r   rM  r  rq   rp   r  r   r   r{  r]   r]   r^   r  v  sj    










z,FullyShardedDataParallel._summon_full_paramsr  c                 C   sf   |D ]\}|j sq|jjdks.td|jj t|j|j|j\}}|jj	d|
  | qdS )a  
        For each handle, writes back this rank's shard of the unsharded
        flattened parameter to the sharded flattened parameter.

        Precondition: Each handle's ``FlatParameter`` 's data points to the
        padded unsharded flattened parameter.
        r"   z-Expects `flat_param` to be flattened but got N)r  r   ndimr?  shaper7   Z_get_unpadded_shardr   r   r  r  copy_)r   r   r   r  r   r]   r]   r^   r    s    	
z3FullyShardedDataParallel._write_back_to_local_shardc                 /   sB   | j tjk}t j||D ]"\}}|r2|td}||fV  qdS )z
        Overrides :meth:`named_buffers()` to intercept buffer names and
        remove all occurrences of the FSDP-specific flattened buffer prefix
        when inside the :meth:`summon_full_params` context manager.
        r  N)r   rR   rl   r   rN  r  FSDP_PREFIX)r   rR  r  in_summon_full_paramsrP  ro  r3  r]   r^   rN    s
    
z&FullyShardedDataParallel.named_buffersc                 /   sB   | j tjk}t j||D ]"\}}|r2|td}||fV  qdS )z
        Overrides :meth:`named_parameters()` to intercept parameter names and
        remove all occurrences of the FSDP-specific flattened parameter prefix
        when inside the :meth:`summon_full_params` context manager.
        r  N)r   rR   rl   r   rr  r  r  )r   rR  r  r  rs  rH  r3  r]   r^   rr    s
    z)FullyShardedDataParallel.named_parameters)outputsr   r}   c                    sz   t  s|S jrd_tr8dj< dj< tt t	ddfdd t j
t j
d fdd}t||S )	aT  
        Registers pre-backward hooks on the tensors that require gradients in
        the forward pass outputs ``outputs``, which were computed using the
        ``FlatParameter`` s of ``handles``.

        Returns:
            Forward pass outputs with pre-backward hooks registered to tensors
            that require gradients.
        FN)r   r  r}   c              	      s   t | }|r j|drdS tjjd  jrD jsD 	  n|rV 
tjg tj _|spW 5 Q R  dS | D ]}tj|_qt |  tj  jd  d j|<  | | D ]}|  qd j|< W 5 Q R X dS )zRPrepares ``_handles`` 's ``FlatParameter`` s for gradient
            computation.FNz+FullyShardedDataParallel._pre_backward_hookrv  T)r   r  r   rc   r  r  r  r  _post_backward_callback_queued_queue_wait_for_post_backwardr  rR   rj   rh   r   r:   r  rz  r   r  ry  r  r  r  Zprepare_gradient)r   r  Z_handles_keyr   r   r]   r^   _pre_backward_hook  s,    





zQFullyShardedDataParallel._register_pre_backward_hooks.<locals>._pre_backward_hook)tr}   c                    s&   | j r"| t  dj< | S )NT)r  register_hookrU  rV  r  )r  r  r   r   r   r]   r^   _register_hookH  s    
zMFullyShardedDataParallel._register_pre_backward_hooks.<locals>._register_hook)rc   is_grad_enabledr  r  r   r  r  r   r7   r   r  r0   )r   r  r   r  r]   r
  r^   r    s    

)z5FullyShardedDataParallel._register_pre_backward_hooksc                 C   s~   t  sdS |D ]h}|j}t|d}|s|js0q||}t|jdk	d |jjd d }|	t
| j|}||f|_qdS )aC  
        Registers post-backward hooks on the ``FlatParameter`` s'
        ``AccumulateGrad`` objects to reshard and to reduce-scatter gradients.

        The ``AccumulateGrad`` object represents the last function that
        finalizes the ``FlatParameter`` 's gradient, so it only runs after its
        entire gradient computation has finished.

        We register the post-backward hook only once in the *first* forward
        that a ``FlatParameter`` participates in. This relies on the
        ``AccumulateGrad`` object being preserved through multiple forwards.
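
        Example (a generic sketch of attaching a hook to a parameter's
        ``AccumulateGrad`` object, not FSDP's exact code; ``param`` is assumed
        to require grad)::

            >>> # xdoctest: +SKIP("undefined variables")
            >>> tmp = param.expand_as(param)  # gives a grad_fn pointing at AccumulateGrad
            >>> acc_grad = tmp.grad_fn.next_functions[0][0]
            >>> hook_handle = acc_grad.register_hook(lambda *unused: print("grad finalized"))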
        N_post_backward_hook_statezZThe `grad_fn` is needed to access the `AccumulateGrad` and register the post-backward hookr   )rc   r  r   r>  r  Z	expand_asr5   Zgrad_fnZnext_functionsr	  rU  rV  _post_backward_hookr  )r   r   r   r   Zalready_registeredZtemp_flat_paramZacc_gradZhook_handler]   r]   r^   r  P  s"    


z6FullyShardedDataParallel._register_post_backward_hooks)r   r  r}   c              
   G   s  |j }d|_tjjd | tjtj	g tj	| _
tj	|_|  rZ| jrZ| j| |jdkrrW 5 Q R  dS |jjrtd| |}| |g|g |f}| | | jsW 5 Q R  dS | jd tj  tj| jd  |jj}|  r|  s|jj | j!j"|j_| j#j$rHt%| j&dk	d t%| j'dk	d |jj}|j(rFd|_t)|}t*|+| j,}	| j,|	d -  |-  }
t./|d|
g}t0|	d }| &| j'|| | 1|| t2|d	}|r8t%|j3j4|j4kd
|j3j4 d|j4  t%|j3j5|j5kd|j3j5 d|j5  | j3|7  _3n||_3|j3}n6| j6t7j8krd| &| j'|j | 9 s|| 1|j| |j:j;r|j<j=|> dd |j?tj  |?| jd  W 5 Q R X W 5 Q R X dS )a  
        Reduce-scatters the gradient of ``handle`` 's ``FlatParameter``.

        Precondition: The ``FlatParameter`` 's ``.grad`` attribute contains the
        unsharded gradient for the local batch.

        Postcondition:
        - If using ``NO_SHARD``, then the ``.grad`` attribute is the reduced
        unsharded gradient.
        - Otherwise, the ``_saved_grad_shard`` attribute is the reduced sharded
        gradient (accumulating with any existing gradient).
        T,FullyShardedDataParallel._post_backward_hookNz;FSDP only works with gradients that don't require gradientsr  z%Communication hook should not be Nonez+Communication hook state should not be Noner   _saved_grad_shardz@Shape mismatch when accumulating gradients: existing grad shape=z new grad shape=zBDevice mismatch when accumulating gradients: existing grad device=z new grad device=)Znon_blocking)@r   r  rc   r  r  r  r  rR   rh   ri   r   r:   r  _use_param_exec_order_policy_param_exec_order_prep_stage_fsdp_params_exec_orderr   gradr  r   !_should_free_unsharded_flat_paramr  r  r  r  ry  r   r  rx  r  r  r  re  r   r`   r  r   r5   r  r  r  r  r  chunkr   r  r  r  r  _cast_grad_to_param_dtyper>  r  r   r   r   rH   r\   r  r  rg   r  r  ri  record_stream)r   r   r  rH  r  r   Zorig_grad_datar  Zgrad_flattenchunksZnum_padZinput_flattenedr  Zaccumulate_gradr]   r]   r^   r  v  s    







 r  )r  rH  c                 C   sP   |  tj |  sL|  s$|  rL|j}|jj|jd|_|	t
j  dS )a!  
        Casts gradient ``grad`` back to the full parameter dtype so that the
        optimizer step runs with that dtype. This performs an actual cast if
        1. parameters were in reduced precision during the forward since then
        gradients would be in that reduced precision, or
        2. parameters were not in reduced precision but gradients were in
        reduced precision for communication.
        However, if a low precision communication hook is registered, then this
        dtype cast happens in the hook instead.
        r  N)r  rR   ri   r  r  r  r  re  rd   r  rc   r   r  )r   r  rH  Zlow_prec_grad_datar]   r]   r^   r    s    z2FullyShardedDataParallel._cast_grad_to_param_dtype)r   c                 C   s   | j r|jp|jjtjkS r   )r  r  r  r   r9   rZ   rt  r]   r]   r^   r  :  s    z:FullyShardedDataParallel._should_free_unsharded_flat_paramc                 C   s<   t | jd | jrdS | tjg d| _tj| j	 dS )z
        Queues a post-backward callback from the root FSDP instance, which
        should happen at the beginning of its pre-backward.
        zL`_queue_wait_for_post_backward()` should be called on the root FSDP instanceNT)
r5   r  r  r  rR   rj   r   Z_execution_engineZqueue_callback_wait_for_post_backwardr   r]   r]   r^   r  @  s    z6FullyShardedDataParallel._queue_wait_for_post_backwardc                    s    j std jr@tj  jd   jj	r@tj 
   j  tdd fdd}tdd fdd}  D ]P}|| || |j  tj|_|jD ]}tj|_q|j  |j r|d	 _q|  r jr   dS )
z?Wait for post-backward to finish. Only called on root instance.z3_wait_for_post_backward can only be called on root.r  N)r   r}   c              
      s   zZg }g }| j D ]:}|j |jj k}|r0q| | || q || W nD tk
r } z&tdd|  dt	| dd |W 5 d}~X Y nX dS )a  
            Reshards full parameters that may not have been resharded in
            ``_post_backward_hook``. This can happen when an FSDP module's output
            is used in the forward pass, so its pre-backward hook fires and
            unshards the parameter, but the post-backward hook never fires
            because the output was ultimately not used in the loss computation,
            so the FSDP parameter never received a gradient.
            Fz&Got exception while resharding module rg  )Zraise_assertion_errorN)
r   r   Zdata_ptrr  r   r  r  	Exceptionr5   r   )r   r{  Zhandles_to_reshardr   Zalready_reshardedrc  r   r]   r^   _catch_all_reshardd  s$    
zLFullyShardedDataParallel._wait_for_post_backward.<locals>._catch_all_reshardc                    s4  | j D ]&}|j}|jrt|drNtt|jdkd |jd   t|d  j	sVqt|drt|j
t
dkd|j
 d|j  |j|_nrt|d	rt|j
|jj
kd|j
 d
|jj
  |jr|j|_|  r|j| jj|j_nt|j p|j d t|d	rt|d	 tt|dd d|_qdS )z&Helper used below on all fsdp modules.r  r   z1p._post_backward_hook_state fields are not valid.r"   r  r   zDevice mismatch: p=z p._cpu_grad=r  z p._saved_grad_shard=zNAll sharded parameters that received a gradient should use `_saved_grad_shard`r  z7Expected flag _post_backward_called to be set on param.FN)r   r   r  r>  r5   r   r  removedelattrr  r   rc   r  r  r  r  r  re  r   r_   r  r  )r   r   rD  r   r]   r^   _finalize_params  sP    






zJFullyShardedDataParallel._wait_for_post_backward.<locals>._finalize_paramsF)r  r?  r  rc   r   r  ry  r  r   rg   rw  r  r   rG   r   r  r   rR   rj   r   r   r:   r  r  r  r  r  )_param_exec_order_policy_second_iter_init)r   r  r  rC  r   r]   r   r^   r  O  s(    
"<



z0FullyShardedDataParallel._wait_for_post_backwardc                 C   s^   d| _ | j  |  D ]@}|| k	rt|trt|ds@tdt|dsRtdd|_ qd S )NF_param_exec_order_policyzINon-root FSDP modules should also have _param_exec_order_policy attributer  zMNon-root FSDP modules should also have _param_exec_order_prep_stage attribute)r  r  reverser6  r   rG   r>  r?  )r   rC  r]   r]   r^   r     s"    
  zBFullyShardedDataParallel._param_exec_order_policy_second_iter_init)stater}   c                 C   sf   t |tr|g}| j|krbd| d| j }| jdkrZtd|   td|  t  t|dS )z!Assert we are in the given state.zexpected to be in states z but current state is r   zAsserting FSDP instance is: zERROR: N)r   rR   r   r   print	tracebackprint_stackr=  )r   r#  msgr]   r]   r^   r    s    


z&FullyShardedDataParallel._assert_statec              	   c   s   |    | jstd| tj g }|  D ]$}t|tr.|	||j
f d|_
q.z
dV  W 5 |D ]\}}|j
rztd||_
qdX dS )a8  
        A context manager to disable gradient synchronizations across FSDP
        instances. Within this context, gradients will be accumulated in module
        variables, which will later be synchronized in the first
        forward-backward pass after exiting the context. This should only be
        used on the root FSDP instance and will recursively apply to all
        children FSDP instances.

        .. note:: This likely results in higher memory usage because FSDP will
            accumulate the full model gradients (instead of gradient shards)
            until the eventual sync.

        .. note:: When used with CPU offloading, the gradients will not be
            offloaded to CPU when inside the context manager. Instead, they
            will only be offloaded right after the eventual sync.
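
        Example (a minimal gradient-accumulation sketch; ``fsdp_model``,
        ``batches``, and ``loss_fn`` are assumptions)::

            >>> # xdoctest: +SKIP("undefined variables")
            >>> with fsdp_model.no_sync():
            >>>     for batch in batches[:-1]:
            >>>         loss_fn(fsdp_model(batch)).backward()  # no gradient sync
            >>> loss_fn(fsdp_model(batches[-1])).backward()  # syncs accumulated gradients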
        z4`no_sync()` on inner FSDP instances is not supportedFzX`_sync_gradients` was incorrectly set to `True` while in the `no_sync()` context managerN)r  r  r?  r  rR   rj   r6  r   rG   r   r  )r   Z	old_flagsrC  Zold_flagr]   r]   r^   no_sync  s    

z FullyShardedDataParallel.no_syncc                 C   s   dd |   D S )z[
        Recursively returns a list of all module parameters that have a gradient.
        c                 S   s   g | ]}|j d k	r|qS r   r  r\  r]   r]   r^   r  *  s     
 z=FullyShardedDataParallel.params_with_grad.<locals>.<listcomp>)rB  r   r]   r]   r^   params_with_grad%  s    z)FullyShardedDataParallel.params_with_grad       @)max_norm	norm_typer}   c                 C   s   |    |   | jstd| tj t|}t|}t| j	|
 }|tjkrr|}tj|tjjj| jd n$|| }tj|| jd |d|  }| jr| }tj||j|jd|d  }|dk r| j	D ],}|jdk	st|j ||jj qdS )	a  
        Clip all gradients at this point in time. The norm is computed over all
        gradients together, as if they were concatenated into a single vector.
        Gradients are modified in-place.

        Args:
            max_norm (float or int): max norm of the gradients
            norm_type (float or int): type of the used p-norm. Can be ``'inf'``
                for infinity norm.

        Returns:
            Total norm of the parameters (viewed as a single vector).

        .. note:: This is analogous to ``torch.nn.utils.clip_grad_norm_`` but
            handles the partitioning and multiple devices per rank under the
            hood. The default torch util is not applicable here, because each
            rank only has a partial view of all the grads in the model, so
            calling it for FSDP models would lead to different scaling being
            applied per subset of model parameters.

        .. warning:: This needs to be called on all ranks, since synchronization
            primitives will be used.
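
        Example (a minimal sketch; ``fsdp_model`` and ``optim`` are assumptions,
        and the backward pass is assumed to have run on all ranks)::

            >>> # xdoctest: +SKIP("undefined variables")
            >>> fsdp_model.clip_grad_norm_(max_norm=1.0, norm_type=2.0)
            >>> optim.step()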
        zBclip_grad_norm should only be called on the root (parent) instance)opr   r   g      ?r   gư>r"   N)r  r  r  r?  r  rR   rj   float_calc_grad_normr*  r   r  infr   Z
all_reducerc   distributedZReduceOpMAXr   r   r   r   rd   r   r  ri  Zmul_re  )r   r,  r-  
local_normZ
total_normZ	clip_coefrD  r]   r]   r^   clip_grad_norm_,  s(    

z(FullyShardedDataParallel.clip_grad_norm_c                 C   s   | d k	rt d d S )NzThe `optim_input` argument is deprecated and will be removed after PyTorch 1.13. You may remove it from your code without changing its functionality.)r   r   )optim_inputr]   r]   r^   _warn_optim_inputb  s    z*FullyShardedDataParallel._warn_optim_inputc                 C   s$   | d kr|d krdS | d k	r dS dS )NTFr]   )r6  optimr]   r]   r^   _is_using_optim_inputj  s
    z.FullyShardedDataParallel._is_using_optim_input)modelr8  r6  rq   r   r}   c              	   C   s,   t | t ||}t| |||d||dS )aD
  
        Consolidates the full optimizer state on rank 0 and returns it
        as a :class:`dict` following the convention of
        :meth:`torch.optim.Optimizer.state_dict`, i.e. with keys ``"state"``
        and ``"param_groups"``. The flattened parameters in ``FSDP`` modules
        contained in ``model`` are mapped back to their unflattened parameters.

        .. warning:: This needs to be called on all ranks since synchronization
            primitives are used. However, if ``rank0_only=True``, then the
            state dict is only populated on rank 0, and all other ranks return
            an empty :class:`dict`.

        .. warning:: Unlike ``torch.optim.Optimizer.state_dict()``, this method
            uses full parameter names as keys instead of parameter IDs.

        .. note:: Like in :meth:`torch.optim.Optimizer.state_dict`, the tensors
            contained in the optimizer state dict are not cloned, so there may
            be aliasing surprises. For best practices, consider saving the
            returned optimizer state dict immediately, e.g. using
            ``torch.save()``.

        Args:
            model (torch.nn.Module): Root module (which may or may not be a
                :class:`FullyShardedDataParallel` instance) whose parameters
                were passed into the optimizer ``optim``.
            optim (torch.optim.Optimizer): Optimizer for ``model`` 's
                parameters.
            optim_input (Optional[Union[List[Dict[str, Any]], Iterable[torch.nn.Parameter]]]):
                Input passed into the optimizer ``optim`` representing either a
                :class:`list` of parameter groups or an iterable of parameters;
                if ``None``, then this method assumes the input was
                ``model.parameters()``. This argument is deprecated, and there
                is no need to pass it in anymore. (Default: ``None``)
            rank0_only (bool): If ``True``, saves the populated :class:`dict`
                only on rank 0; if ``False``, saves it on all ranks. (Default:
                ``True``)
            group (dist.ProcessGroup): Model's process group or ``None`` if using
                the default process group. (Default: ``None``)

        Returns:
            Dict[str, Any]: A :class:`dict` containing the optimizer state for
            ``model`` 's original unflattened parameters and including keys
            "state" and "param_groups" following the convention of
            :meth:`torch.optim.Optimizer.state_dict`. If ``rank0_only=True``,
            then nonzero ranks return an empty :class:`dict`.
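
        Example (an illustrative sketch; ``model`` and ``optim`` are
        placeholders, as in the other examples in this class)::

            >>> # xdoctest: +SKIP("undefined variables")
            >>> from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
            >>> model, optim = ...
            >>> full_osd = FSDP.full_optim_state_dict(model, optim)
            >>> if torch.distributed.get_rank() == 0:
            ...     torch.save(full_osd, "full_osd.pt")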
        Fr:  r8  r6  rq   shard_stater   using_optim_inputrG   r7  r9  r*   )r:  r8  r6  rq   r   r=  r]   r]   r^   full_optim_state_dictu  s    8
 z.FullyShardedDataParallel.full_optim_state_dict)r:  r8  r6  r   r}   c              	   C   s,   t | t ||}t| ||dd||dS )a+  
        The API is similar to :meth:`full_optim_state_dict` but this API chunks
        all non-zero-dimension states to :class:`ShardedTensor` to save memory.
        This API should only be used when the model ``state_dict`` is derived
        with the context manager ``with state_dict_type(SHARDED_STATE_DICT):``.

        For the detailed usage, refer to :meth:`full_optim_state_dict`.

        .. warning:: The returned state dict contains ``ShardedTensor`` and
            cannot be directly used by the regular ``optim.load_state_dict``.
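
        Example (an illustrative sketch; ``model``, ``optim``, and ``PATH`` are
        placeholders)::

            >>> # xdoctest: +SKIP("undefined variables")
            >>> with FSDP.state_dict_type(model, StateDictType.SHARDED_STATE_DICT):
            ...     model_state = model.state_dict()
            >>> sharded_osd = FSDP.sharded_optim_state_dict(model, optim)
            >>> torch.save({"model": model_state, "optim": sharded_osd}, PATH)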
        """
        FullyShardedDataParallel._warn_optim_input(optim_input)
        using_optim_input = FullyShardedDataParallel._is_using_optim_input(
            optim_input, optim,
        )
        return _optim_state_dict(
            model=model,
            optim=optim,
            optim_input=optim_input,
            rank0_only=False,
            shard_state=True,
            group=group,
            using_optim_input=using_optim_input,
        )

    @staticmethod
    def shard_full_optim_state_dict(
        full_optim_state_dict: Dict[str, Any],
        model: torch.nn.Module,
        optim_input: Optional[
            Union[List[Dict[str, Any]], Iterable[torch.nn.Parameter]]
        ] = None,
        optim: Optional[torch.optim.Optimizer] = None,
    ) -> Dict[str, Any]:
        """
        Shards the full optimizer state dict ``full_optim_state_dict`` by
        remapping the state to flattened parameters instead of unflattened
        parameters and restricting to only this rank's part of the optimizer
        state. The first argument should be the return value of
        :meth:`full_optim_state_dict`.

        Example::

            >>> # xdoctest: +SKIP("undefined variables")
            >>> from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
            >>> model, optim = ...
            >>> full_osd = FSDP.full_optim_state_dict(model, optim)
            >>> torch.save(full_osd, PATH)
            >>> # Define new model with possibly different world size
            >>> new_model, new_optim = ...
            >>> full_osd = torch.load(PATH)
            >>> sharded_osd = FSDP.shard_full_optim_state_dict(full_osd, new_model)
            >>> new_optim.load_state_dict(sharded_osd)

        .. note:: Both :meth:`shard_full_optim_state_dict` and
            :meth:`scatter_full_optim_state_dict` may be used to get the
            sharded optimizer state dict to load. Assuming that the full
            optimizer state dict resides in CPU memory, the former requires
            each rank to have the full dict in CPU memory, where each rank
            individually shards the dict without any communication, while the
            latter requires only rank 0 to have the full dict in CPU memory,
            where rank 0 moves each shard to GPU memory (for NCCL) and
            communicates it to ranks appropriately. Hence, the former has
            higher aggregate CPU memory cost, while the latter has higher
            communication cost.

        Args:
            full_optim_state_dict (Dict[str, Any]): Optimizer state dict
                corresponding to the unflattened parameters and holding the
                full non-sharded optimizer state.
            model (torch.nn.Module): Root module (which may or may not be a
                :class:`FullyShardedDataParallel` instance) whose parameters
                correspond to the optimizer state in ``full_optim_state_dict``.
            optim_input (Optional[Union[List[Dict[str, Any]], Iterable[torch.nn.Parameter]]]):
                Input passed into the optimizer representing either a
                :class:`list` of parameter groups or an iterable of parameters;
                if ``None``, then this method assumes the input was
                ``model.parameters()``. This argument is deprecated, and there
                is no need to pass it in anymore. (Default: ``None``)
            optim (Optional[torch.optim.Optimizer]): Optimizer that will load
                the state dict returned by this method. This is the preferred
                argument to use over ``optim_input``. (Default: ``None``)

        Returns:
            Dict[str, Any]: The full optimizer state dict now remapped to
            flattened parameters instead of unflattened parameters and
            restricted to only include this rank's part of the optimizer state.
        TrG   r7  r9  r%   r,   )r?  r:  r6  r8  r=  sharded_osdr]   r]   r^   shard_full_optim_state_dict  s"    A
       z4FullyShardedDataParallel.shard_full_optim_state_dict)r@  r:  r6  r8  r}   c                 C   s4   t | t ||}t| |dd}t|||||S )a  
        The API is similar to :meth:`shard_full_optim_state_dict`. The only
        difference is that the input ``sharded_optim_state_dict`` should be
        returned from :meth:`sharded_optim_state_dict`. Therefore, there will
        be all-gather calls on each rank to gather ``ShardedTensor`` s.

        Args:
            sharded_optim_state_dict (Dict[str, Any]): Optimizer state dict
                corresponding to the unflattened parameters and holding the
                sharded optimizer state.
            model (torch.nn.Module):
                Refer to :meth:`shard_full_optim_state_dict`.

        Returns:
            Refer to :meth:`shard_full_optim_state_dict`.
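
        Example (an illustrative sketch; ``model``, ``optim``, and ``PATH`` are
        placeholders)::

            >>> # xdoctest: +SKIP("undefined variables")
            >>> sharded_osd = torch.load(PATH)["optim"]
            >>> flattened_osd = FSDP.flatten_sharded_optim_state_dict(
            ...     sharded_osd, model, optim=optim,
            ... )
            >>> optim.load_state_dict(flattened_osd)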
        Tr:  r<  rA  )r@  r:  r6  r8  r=  Zflattened_osdr]   r]   r^    flatten_sharded_optim_state_dict3  s"    
     z9FullyShardedDataParallel.flatten_sharded_optim_state_dict)r?  r:  r6  r8  r   r}   c                 C   s   t | t ||}|dkr.t|dr.|j}t|}t|}tj	|}t
j rbt
dnt
d}	|rt
j std|dkr| dkrtdt| |dd	}
t|
|}t|dkr|nd||}t||dkr|
nd||||	}t|||||}|S )
a  
        Scatters the full optimizer state dict from rank 0 to all other ranks,
        returning the sharded optimizer state dict on each rank. The return
        value is the same as :meth:`shard_full_optim_state_dict`, and on rank
        0, the first argument should be the return value of
        :meth:`full_optim_state_dict`.

        Example::

            >>> # xdoctest: +SKIP("undefined variables")
            >>> from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
            >>> model, optim = ...
            >>> full_osd = FSDP.full_optim_state_dict(model, optim)  # only non-empty on rank 0
            >>> # Define new model with possibly different world size
            >>> new_model, new_optim, new_group = ...
            >>> sharded_osd = FSDP.scatter_full_optim_state_dict(full_osd, new_model, group=new_group)
            >>> new_optim.load_state_dict(sharded_osd)

        .. note:: Both :meth:`shard_full_optim_state_dict` and
            :meth:`scatter_full_optim_state_dict` may be used to get the
            sharded optimizer state dict to load. Assuming that the full
            optimizer state dict resides in CPU memory, the former requires
            each rank to have the full dict in CPU memory, where each rank
            individually shards the dict without any communication, while the
            latter requires only rank 0 to have the full dict in CPU memory,
            where rank 0 moves each shard to GPU memory (for NCCL) and
            communicates it to ranks appropriately. Hence, the former has
            higher aggregate CPU memory cost, while the latter has higher
            communication cost.

        Args:
            full_optim_state_dict (Optional[Dict[str, Any]]): Optimizer state
                dict corresponding to the unflattened parameters and holding
                the full non-sharded optimizer state if on rank 0; the argument
                is ignored on nonzero ranks.
            model (torch.nn.Module): Root module (which may or may not be a
                :class:`FullyShardedDataParallel` instance) whose parameters
                correspond to the optimizer state in ``full_optim_state_dict``.
            optim_input (Optional[Union[List[Dict[str, Any]], Iterable[torch.nn.Parameter]]]):
                Input passed into the optimizer representing either a
                :class:`list` of parameter groups or an iterable of parameters;
                if ``None``, then this method assumes the input was
                ``model.parameters()``. This argument is deprecated, and there
                is no need to pass it in anymore. (Default: ``None``)
            optim (Optional[torch.optim.Optimizer]): Optimizer that will load
                the state dict returned by this method. This is the preferred
                argument to use over ``optim_input``. (Default: ``None``)
            group (dist.ProcessGroup): Model's process group or ``None`` if
                using the default process group. (Default: ``None``)

        Returns:
            Dict[str, Any]: The full optimizer state dict now remapped to
            flattened parameters instead of unflattened parameters and
            restricted to only include this rank's part of the optimizer state.
        """
        FullyShardedDataParallel._warn_optim_input(optim_input)
        using_optim_input = FullyShardedDataParallel._is_using_optim_input(
            optim_input, optim,
        )
        # Use the passed-in process group, the model's process group, or the
        # default process group, in that order of preference
        if group is None and hasattr(model, "process_group"):
            group = model.process_group
        rank = dist.get_rank(group)
        world_size = dist.get_world_size(group)
        # Check for a valid broadcast device
        using_nccl = dist.distributed_c10d._check_for_nccl_backend(group)
        broadcast_device = (
            torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
        )
        if using_nccl and not torch.cuda.is_available():
            raise RuntimeError("NCCL requires a GPU for collectives")
        if rank == 0:
            if full_optim_state_dict is None:
                raise ValueError("Rank 0 must pass in the full optimizer state dict")
            # Flatten the state dict and replace positive-dimension tensor
            # state with shape metadata, since those tensors are broadcast
            # separately to avoid extra copies
            flat_osd = _flatten_optim_state_dict(full_optim_state_dict, model, False)
            processed_osd = _process_pos_dim_tensor_state(flat_osd, world_size)
        # Broadcast the processed state dict (without positive-dimension
        # tensor state) from rank 0 to all ranks
        processed_osd = _broadcast_processed_optim_state_dict(
            processed_osd if rank == 0 else None, rank, group,
        )
        # Broadcast the positive-dimension tensor state, sharding as needed
        _broadcast_pos_dim_tensor_states(
            processed_osd,
            flat_osd if rank == 0 else None,
            rank,
            world_size,
            group,
            broadcast_device,
        )
        # Re-key the optimizer state dict to use parameter IDs from this
        # rank's `optim`
        sharded_osd = _rekey_sharded_optim_state_dict(
            processed_osd, model, optim, optim_input, using_optim_input,
        )
        return sharded_osd

    @staticmethod
    def rekey_optim_state_dict(
        optim_state_dict: Dict[str, Any],
        optim_state_key_type: OptimStateKeyType,
        model: torch.nn.Module,
        optim_input: Optional[
            Union[List[Dict[str, Any]], Iterable[torch.nn.Parameter]]
        ] = None,
        optim: Optional[torch.optim.Optimizer] = None,
    ) -> Dict[str, Any]:
        """
|	|tjkrt|s|tjkrt|r|S i }
|tjkrV|rt||nt|}t|fdd|D   fdd|d  D |
d< t|d	 |
d	< |
d	 D ]$}t fd
d|d D |d< q,|
S |tjkrt|}|rzt||nt|fdd| D fdd|d  D |
d< t|d	 |
d	< |
d	 D ]$}tfdd|d D |d< q|
S |
S )aO  
        Re-keys the optimizer state dict ``optim_state_dict`` to use the key
        type ``optim_state_key_type``. This can be used to achieve
        compatibility between optimizer state dicts from models with FSDP
        instances and ones without.

        To re-key an FSDP full optimizer state dict (i.e. from
        :meth:`full_optim_state_dict`) to use parameter IDs and be loadable to
        a non-wrapped model::

            >>> # xdoctest: +SKIP("undefined variables")
            >>> wrapped_model, wrapped_optim = ...
            >>> full_osd = FSDP.full_optim_state_dict(wrapped_model, wrapped_optim)
            >>> nonwrapped_model, nonwrapped_optim = ...
            >>> rekeyed_osd = FSDP.rekey_optim_state_dict(full_osd, OptimStateKeyType.PARAM_ID, nonwrapped_model)
            >>> nonwrapped_optim.load_state_dict(rekeyed_osd)

        To re-key a normal optimizer state dict from a non-wrapped model to be
        loadable to a wrapped model::

            >>> # xdoctest: +SKIP("undefined variables")
            >>> nonwrapped_model, nonwrapped_optim = ...
            >>> osd = nonwrapped_optim.state_dict()
            >>> rekeyed_osd = FSDP.rekey_optim_state_dict(osd, OptimStateKeyType.PARAM_NAME, nonwrapped_model)
            >>> wrapped_model, wrapped_optim = ...
            >>> sharded_osd = FSDP.shard_full_optim_state_dict(rekeyed_osd, wrapped_model)
            >>> wrapped_optim.load_state_dict(sharded_osd)

        Returns:
            Dict[str, Any]: The optimizer state dict re-keyed using the
            parameter keys specified by ``optim_state_key_type``.
        """
        FullyShardedDataParallel._warn_optim_input(optim_input)
        using_optim_input = FullyShardedDataParallel._is_using_optim_input(
            optim_input, optim,
        )
        osd = optim_state_dict  # alias
        # Validate that the existing parameter keys are uniformly typed
        uses_param_name_mask = [type(param_key) is str for param_key in osd["state"]]
        uses_param_id_mask = [type(param_key) is int for param_key in osd["state"]]
        if (any(uses_param_name_mask) and not all(uses_param_name_mask)) or (
            any(uses_param_id_mask) and not all(uses_param_id_mask)
        ):
            error_msg = f"Invalid parameter keys: {osd['state'].keys()}"
            raise ValueError(error_msg)
        # Return directly if the existing key type already matches the target
        if (
            optim_state_key_type == OptimStateKeyType.PARAM_NAME
            and all(uses_param_name_mask)
        ) or (
            optim_state_key_type == OptimStateKeyType.PARAM_ID
            and all(uses_param_id_mask)
        ):
            return osd
        # Otherwise, actually perform the re-keying
        new_osd = {}
        if optim_state_key_type == OptimStateKeyType.PARAM_NAME:  # ID -> name
            param_id_to_param = (
                _get_param_id_to_param_from_optim_input(model, optim_input)
                if using_optim_input
                else _get_param_id_to_param(optim)
            )
            param_to_param_name = _get_param_to_param_name(model)
            param_id_to_param_name = [
                param_to_param_name[param] for param in param_id_to_param
            ]
            new_osd["state"] = {
                param_id_to_param_name[param_id]: param_state
                for param_id, param_state in osd["state"].items()
            }
            new_osd["param_groups"] = copy.deepcopy(osd["param_groups"])
            for param_group in new_osd["param_groups"]:
                param_group["params"] = sorted(
                    param_id_to_param_name[param_id]
                    for param_id in param_group["params"]
                )
            return new_osd
        elif optim_state_key_type == OptimStateKeyType.PARAM_ID:  # name -> ID
            param_name_to_param = _get_param_name_to_param(model)
            param_to_param_id = (
                _get_param_to_param_id_from_optim_input(model, optim_input)
                if using_optim_input
                else _get_param_to_param_id(optim)
            )
            # Not all model parameters were necessarily passed to the
            # optimizer, so only keep parameters that have an ID
            param_name_to_param_id = {
                param_name: param_to_param_id[param]
                for param_name, param in param_name_to_param.items()
                if param in param_to_param_id
            }
            new_osd["state"] = {
                param_name_to_param_id[param_name]: param_state
                for param_name, param_state in osd["state"].items()
            }
            new_osd["param_groups"] = copy.deepcopy(osd["param_groups"])
            for param_group in new_osd["param_groups"]:
                param_group["params"] = sorted(
                    param_name_to_param_id[param_name]
                    for param_name in param_group["params"]
                )
            return new_osd
        return new_osd  # should never reach here

    def _get_default_comm_hook(self):
        """
        Returns a default communication hook based on a sharding strategy.
        N)r   rH   r\   r   Zreduce_scatter_hookZallreduce_hookr   r]   r]   r^   r  D  s    z/FullyShardedDataParallel._get_default_comm_hookc                 C   s   t j| jdS )zZ
        Returns a default communication hook state based on a sharding strategy.
        """
        return default_hooks.DefaultState(process_group=self.process_group)

    def register_comm_hook(self, state: object, hook: Callable):
        """
        Registers a communication hook which is an enhancement that provides a
        flexible hook to users where they can specify how FSDP aggregates gradients
        across multiple workers.
        This hook can be used to implement several algorithms like
        `GossipGrad <https://arxiv.org/abs/1803.05880>`_ and gradient compression
        which involve different communication strategies for
        parameter syncs while training with :class:`FullyShardedDataParallel`.

        .. warning ::
            FSDP communication hook should be registered before running an initial forward pass
            and only once.

        Args:
            state (object): Passed to the hook to maintain any state information during the training process.
                            Examples include error feedback in gradient compression,
                            peers to communicate with next in `GossipGrad <https://arxiv.org/abs/1803.05880>`_, etc.
                            It is locally stored by each worker
                            and shared by all the gradient tensors on the worker.
            hook (Callable): Callable, which has one of the following signatures:
                            1) ``hook: Callable[torch.Tensor] -> None``:
                            This function takes in a Python tensor, which represents
                            the full, flattened, unsharded gradient with respect to all variables
                            corresponding to the model this FSDP unit is wrapping
                            (that are not wrapped by other FSDP sub-units).
                            It then performs all necessary processing and returns ``None``;
                            2) ``hook: Callable[torch.Tensor, torch.Tensor] -> None``:
                            This function takes in two Python tensors, the first one represents
                            the full, flattened, unsharded gradient with respect to all variables
                            corresponding to the model this FSDP unit is wrapping
                            (that are not wrapped by other FSDP sub-units). The latter
                            represents a pre-sized tensor to store a chunk of a sharded gradient after
                            reduction.
                            In both cases, callable performs all necessary processing and returns ``None``.
                            Callables with signature 1 are expected to handle gradient communication for a `NO_SHARD` case.
                            Callables with signature 2 are expected to handle gradient communication for sharded cases.

        z9register_comm_hook can only be called on a root instance.z.communication hook can be only registered onceTz0communication hook should be default, but it is z insteadN)r  r?  r   r  r  r  rV   r  )r   r#  r\  rA  r]   r]   r^   register_comm_hookS  s    'z+FullyShardedDataParallel.register_comm_hookc                 O   s  |d }|d }t |dstts6|jd kstdnt|jtr|jj}t|}| D ]}t|t	rZtdqZt
|||dN z|||jj W n2 tk
r } ztd| dW 5 d }~X Y nX W 5 Q R X n|jd kstd	|j|d< | j|| d
| _d
| _g | _trt|jtrt }	| | D ]}
|
|	|
j< q4|jD ].}||	krL|	| jD ]}| j| qdqLd| _|  D ]4}|| k	rt|t	r| j|_| j|_| j|_qd S )Nr   r   tracing_configz:tracing_config should be None when torch.fx is not enabledzAThe input module of _patch_tracer should not contain FSDP modules)tracerr5  execution_infozNtracer.trace failed inside _init_param_exec_order_wrap_policy with the error: rU   zGtracing_config should either be an instance of TracingConfig or be NoneTF)r>  r?  _TORCH_FX_AVAILr^  r   rD   r_  rE   r6  rG   rF   traceZconcrete_argsrb  r   Zinit_policyr   r!  r  r  dictr   r   Zmodule_forward_orderr
  r   )r   rR  r  r   r   r_  r`  rC  rc  Zmodule_to_fsdpwrapr   r]   r]   r^   r     st     
"


z;FullyShardedDataParallel._init_param_exec_order_wrap_policyc                 C   s   t | do| jS )Nr!  )r>  r!  r   r]   r]   r^   r    s    
z5FullyShardedDataParallel._use_param_exec_order_policyc                 C   s8   t | do| j}|s4|  D ]}t |drtdq|S )Nr  Z_params_exec_order_hook_handlez]When not in execution order prep stage, all _params_exec_order_hook_handle should be removed.)r>  r  rB  r?  )r   Zis_prep_stagerD  r]   r]   r^   _is_param_exec_order_prep_stage  s    
z8FullyShardedDataParallel._is_param_exec_order_prep_stage)NNNNNNNNNFFF)F)NNNT)N)T)TTFF)TTFF)r+  )NTN)NN)NN)NN)NNN)NN)rV   rW   rX   rY   r;  r<  r   r   rH   rJ   r   rK   rI   r   rc   r   r   r   rf   r   r   r   r   r!   r   r   r   r	   r   r   r   r   r  r  r  r   r  r   r  r	  r7   r  r  rz  r  propertyr   r  r  r  staticmethodr   r  r  r  r  r  r  rd   r  r  r  r  r  r  r  r   r  r  r:   r  r  r   rL   rM   r
   r  r  r  r!  r"  r#  r   r  r  r+  r  r&  r,  r'  r-  r(  r%  r*  r   r   r  r  r  r  r  r  r  r  r  r  r  r  r  rN  rr  r  r  r  r6   r  r  r  r  r   rR   r  r(  r*  r/  r5  r7  r9  r8  Z	Optimizerr   r?  r@  rC  rE  rF  rQ   r[  r  r  objectra  r]  r   r  re  __classcell__r]   r]   r3  r^   rG     s   Q             6-


+7/%  "  
  
    =9i7 A


P

#



a	

'
J
	
2
 #"'    R    OP& ) # 
 
5

   
E 
+ 


K 


*  

n 




def _calc_grad_norm(parameters: List[torch.nn.Parameter], p: float) -> torch.Tensor:
    r"""Calculate gradient norm of an iterable of parameters.
    Returns:
        Total norm of the parameters (viewed as a single vector).
    """
    parameters = [param for param in parameters if param.grad is not None]

    if len(parameters) == 0:
        return torch.tensor(0.0)
    if p == math.inf:
        local_norm = torch.tensor(
            max(par.grad.detach().abs().max() for par in parameters)
        )
    else:
        # Compute the norm in full precision regardless of the gradient dtype
        local_norm = torch.linalg.vector_norm(
            torch.stack(
                [
                    torch.linalg.vector_norm(par.grad.detach(), p, dtype=torch.float32)
                    for par in parameters
                ]
            ),
            p,
        )
    local_norm = local_norm.to(dtype=parameters[0].dtype)
    return local_norm


def _get_param_to_unflat_param_names(
    model: torch.nn.Module,
    dedup_shared_params: bool = True,
) -> Dict[torch.nn.Parameter, List[str]]:
    """
    Constructs a mapping from flattened parameter (including non-FSDP-module
    parameters) to its unflattened parameter names. For non-FSDP-module
    parameters, these mapped-to lists always contain a single element. The
    unflattened parameter names should match the keys of the model state dict.

    For shared parameters, only the first parameter name is included (following
    the ``torch.nn.Module.parameters()`` order).

    Args:
        model (torch.nn.Module): Root module (which may or may not be a
            :class:`FullyShardedDataParallel` instance).
        dedup_shared_params (bool): If ``True``, only includes the first
            list of unflattened parameter names corresponding to a parameter
            in the module walk order; if ``False``, then includes all of the
            unflattened parameter names.
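
    Example (an illustrative sketch; ``model`` is a placeholder and the shown
    mapping is hypothetical)::

        >>> # xdoctest: +SKIP("undefined variables")
        >>> mapping = _get_param_to_unflat_param_names(model)
        >>> # e.g. {<FlatParameter>: ["layer1.weight", "layer1.bias"], ...}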
    """
    def module_fn(module, prefix, param_to_unflat_param_names):
        # For FSDP modules, only add the entry when considering the contained
        # `FlattenParamsWrapper` to avoid duplication
        if not isinstance(module, FullyShardedDataParallel):
            for param_name, param in module.named_parameters(recurse=False):
                module_prefixed_param_names = (
                    param._prefixed_param_names
                    if type(param) is FlatParameter
                    else [param_name]
                )  # prefixed from `module`
                fully_prefixed_param_names = [
                    _clean_tensor_name(prefix + name)
                    for name in module_prefixed_param_names
                ]  # fully prefixed from the top level
                is_shared_param = param in param_to_unflat_param_names
                if not is_shared_param:
                    param_to_unflat_param_names[param] = fully_prefixed_param_names
                elif not dedup_shared_params:
                    param_to_unflat_param_names[param].extend(
                        fully_prefixed_param_names
                    )

    def return_fn(param_to_unflat_param_names):
        return param_to_unflat_param_names

    param_to_unflat_param_names: Dict[torch.nn.Parameter, List[str]] = {}
    return _apply_to_modules(
        model,
        module_fn,
        return_fn,
        param_to_unflat_param_names,
    )


def _get_param_to_param_name(
    model: torch.nn.Module,
) -> Dict[torch.nn.Parameter, str]:
    """
    Constructs a mapping from parameters to their parameter names. ``model``
    should not contain any :class:`FullyShardedDataParallel` instances, which
    means that none of the parameters should be ``FlatParameter`` s. As a
    result, compared to :meth:`_get_param_to_unflat_param_names`, the mapped
    values may be flattened from singleton :class:`list` s to the contained
    names themselves.

    Args:
        model (torch.nn.Module): Root module, which should not contain any
            :class:`FullyShardedDataParallel` instances.
    """
    param_to_param_names = _get_param_to_unflat_param_names(model)
    for param_names in param_to_param_names.values():
        assert len(param_names) > 0, (
            "`_get_param_to_unflat_param_names()` should not construct empty lists"
        )
        if len(param_names) > 1:
            raise RuntimeError(
                "Each parameter should only map to one parameter name but got "
                f"{len(param_names)}: {param_names}"
            )
    param_to_param_name = {
        param: param_names[0]
        for param, param_names in param_to_param_names.items()
    }
    return param_to_param_name


def _get_param_name_to_param(
    model: torch.nn.Module,
) -> Dict[str, torch.nn.Parameter]:
    """Constructs the inverse mapping of :meth:`_get_param_to_param_name`."""
    param_to_param_name = _get_param_to_param_name(model)
    return dict(zip(param_to_param_name.values(), param_to_param_name.keys()))


def _clean_tensor_name(tensor_name: str) -> str:
    """Cleans the parameter or buffer name by removing any module wrapper
    prefixes."""
    tensor_name = tensor_name.replace(FSDP_PREFIX, "")
    tensor_name = tensor_name.replace(FPW_MODULE + ".", "")
    # Stripping the checkpoint-wrapper prefix keeps activation-checkpointed
    # submodules' names consistent with the unwrapped model's state dict keys
    tensor_name = tensor_name.replace(_CHECKPOINT_PREFIX + ".", "")
    return tensor_name