U
    KcF                     @   sl   d dl mZmZmZ d dlZd dlmZ d dlmZ d dlZddl	m
Z
mZ d dlZdgZG dd dZdS )	    )DictAnyListN)defaultdict)nn   )fqn_to_modulemodule_to_fqnActivationSparsifierc                   @   s   e Zd ZdZd,ejdddZedd Zdd	 Z	d-ejd
ddZ
d.eejdddZdd Zdd Zdd Zdd Zd/ddZdd Zd0ddZeeef dd d!Zeeef dd"d#d$Zeeef dd%d&Zeeef dd'd(d)Zd*d+ ZdS )1r
   aL  
    The Activation sparsifier class aims to sparsify/prune activations in a neural
    network. The idea is to attach the sparsifier to a layer (or layers) and it
    zeroes out the activations based on the mask_fn (or sparsification function)
    input by the user.
    The mask_fn is applied once all the inputs are aggregated and reduced i.e.
    mask = mask_fn(reduce_fn(aggregate_fn(activations)))

    Note::
        The sparsification mask is computed on the input **before it goes through the attached layer**.

    Args:
        model (nn.Module):
            The model whose layers will be sparsified. The layers that needs to be
            sparsified should be added separately using the register_layer() function
        aggregate_fn (Optional, Callable):
            default aggregate_fn that is used if not specified while registering the layer.
            specifies how inputs should be aggregated over time.
            The aggregate_fn should usually take 2 torch tensors and return the aggregated tensor.
            Example
                def add_agg_fn(tensor1, tensor2):  return tensor1 + tensor2
                reduce_fn (Optional, Callable):
                    default reduce_fn that is used if not specified while registering the layer.
                    reduce_fn will be called on the aggregated tensor i.e. the tensor obtained after
                    calling agg_fn() on all inputs.
                    Example
                def mean_reduce_fn(agg_tensor):    return agg_tensor.mean(dim=0)
                mask_fn (Optional, Callable):
                    default mask_fn that is used to create the sparsification mask using the tensor obtained after
                    calling the reduce_fn(). This is used by default if a custom one is passed in the
                    register_layer().
                    Note that the mask_fn() definition should contain the sparse arguments that is passed in sparse_config
                    arguments.
                features (Optional, list):
                    default selected features to sparsify.
                    If this is non-empty, then the mask_fn will be applied for each feature of the input.
                    For example,
                mask = [mask_fn(reduce_fn(aggregated_fn(input[feature])) for feature in features]
                feature_dim (Optional, int):
                    default dimension of input features. Again, features along this dim will be chosen
                    for sparsification.
                sparse_config (Dict):
                    Default configuration for the mask_fn. This config will be passed
                    with the mask_fn()

    Example:
        >>> # xdoctest: +SKIP
        >>> model = SomeModel()
        >>> act_sparsifier = ActivationSparsifier(...)  # init activation sparsifier
        >>> # Initialize aggregate_fn
        >>> def agg_fn(x, y):
        >>>     return x + y
        >>>
        >>> # Initialize reduce_fn
        >>> def reduce_fn(x):
        >>>     return torch.mean(x, dim=0)
        >>>
        >>> # Initialize mask_fn
        >>> def mask_fn(data):
        >>>     return torch.eye(data.shape).to(data.device)
        >>>
        >>>
        >>> act_sparsifier.register_layer(model.some_layer, aggregate_fn=agg_fn, reduce_fn=reduce_fn, mask_fn=mask_fn)
        >>>
        >>> # start training process
        >>> for _ in [...]:
        >>>     # epoch starts
        >>>         # model.forward(), compute_loss() and model.backwards()
        >>>     # epoch ends
        >>>     act_sparsifier.step()
        >>> # end training process
        >>> sparsifier.squash_mask()
    N)modelc                 K   sb   || _ t | _|| jd< || jd< || jd< || jd< || jd< || jd< tt| _tt| _d S )Nsparse_configaggregate_fn	reduce_fnmask_fnfeaturesfeature_dim)r   r   defaultsdictdata_groupsstate)selfr   r   r   r   r   r   r    r   o/tmp/pip-unpacked-wheel-gikjz4vx/torch/ao/sparsity/_experimental/activation_sparsifier/activation_sparsifier.py__init__V   s    






zActivationSparsifier.__init__c                 C   sZ   | d | d  }}|dk	r*|dk	s*t ddddg}|D ]}| | }t|s8t dq8dS )	zXMakes sure that some of the functions and attributes are not passed incorrectly
        r   r   Nz#need feature dim to select featuresr   r   r   zfunction should be callable)AssertionErrorcallable)argsr   r   Zfn_keyskeyfnr   r   r   _safe_rail_checksi   s    
z&ActivationSparsifier._safe_rail_checksc                    sH   j  d j  d j  d  dd fdd}|S )zMReturns hook that computes aggregate of activations passing through.
        r   r   r   Nreturnc           	         s,  |d }j  d}d krR|d krFt|}t|j d<  ||}n|d krdd tdtD }dd tdtD j d< n|}ttD ]r}t| g	 
|j}t||}|d krt|}t|j d |< n|| } ||||< q|j  d< d S )Nr   datamaskc                 S   s   g | ]}d qS r   r   .0_r   r   r   
<listcomp>   s     zFActivationSparsifier._aggregate_hook.<locals>.hook.<locals>.<listcomp>c                 S   s   g | ]}d qS r$   r   r%   r   r   r   r(      s     )r   gettorchZ
zeros_likeZ	ones_liker   rangelenTensorlongtodeviceindex_select)	moduleinput
input_datar"   Zout_datafeature_idxZfeature_tensordata_featureZ	curr_dataZagg_fnr   r   namer   r   r   hook   s(    
$

z2ActivationSparsifier._aggregate_hook.<locals>.hook)r   r   r8   r9   r   r7   r   _aggregate_hooky   s
    z$ActivationSparsifier._aggregate_hook)layerc                 K   s   t | j|}|dk	std|| jkr<td | j|d t| j	}	||||||d}
|	
dd |
 D  |	d 
| | |	 |	| j|< || j|d}d| j| d	< || j| d
< d| j| d< dS )a
  
        Registers a layer for sparsification. The layer should be part of self.model.
        Specifically, registers a pre-forward hook to the layer. The hook will apply the aggregate_fn
        and store the aggregated activations that is input over each step.

        Note::
            - There is no need to pass in the name of the layer as it is automatically computed as per
              the fqn convention.

            - All the functions (fn) passed as argument will be called at a dim, feature level.
        Nzlayer not found in the modelzalayer already attached to the sparsifier, deregistering the layer and registering with new config)r8   )r   r   r   r   r   r<   c                 s   s"   | ]\}}|d k	r||fV  qd S )Nr   )r&   argvalr   r   r   	<genexpr>   s      z6ActivationSparsifier.register_layer.<locals>.<genexpr>r   r#   r9   	aggregate
hook_state)r	   r   r   r   warningswarnunregister_layercopydeepcopyr   updateitemsr   register_forward_pre_hookr;   r   )r   r<   r   r   r   r   r   r   r8   Z
local_argsZupdate_dictZagg_hookr   r   r   register_layer   s*    



z#ActivationSparsifier.register_layer)r8   r<   c                 C   s   |dk	s|dk	st d|dkrH|dk	s,t t| j|}|dk	sHt d|| jkrZtd| j| dd}|dkr|td|S )a  
        Returns mask associated to the layer.

        The mask is
            - a torch tensor is features for that layer is None.
            - a list of torch tensors for each feature, otherwise

        Note::
            The shape of the mask is unknown until model.forward() is applied.
            Hence, if get_mask() is called before model.forward(), an
            error will be raised.
        Nz0Need at least name or layer obj to retrieve maskz&layer not found in the specified modelz*Error: layer with the given name not foundr#   zFError: shape unknown, call layer() routine at least once to infer mask)r   r	   r   r   
ValueErrorr)   )r   r8   r<   r#   r   r   r   get_mask   s    
zActivationSparsifier.get_maskc                 C   s.   | j | d   | j| | j | dS )z/Detaches the sparsifier from the layer
        r9   N)r   remover   pop)r   r8   r   r   r   rD      s    z%ActivationSparsifier.unregister_layerc              	   C   sR   t  @ | j D ].\}}|d }| ||| | j| d qW 5 Q R X dS )zCInternally calls the update_mask() function for each layer
        r"   N)r*   Zno_gradr   rH   update_maskrN   )r   r8   configsr"   r   r   r   step   s
    
zActivationSparsifier.stepc                 C   s~   |  |}|d }|d }|d }|d }|dkrJ||}||f||_n0tt|D ]"}	|||	 }
||
f|||	 _qVdS )a(  
        Called for each registered layer and does the following-
            1. apply reduce_fn on the aggregated activations
            2. use mask_fn to compute the sparsification mask

        Note:
            the reduce_fn and mask_fn is called for each feature, dim over the data
        r   r   r   r   N)rL   r"   r+   r,   )r   r8   r"   rP   r#   r   r   r   r   r5   r6   r   r   r   rO     s    	
z ActivationSparsifier.update_maskc                    s:   |  || j| d | j| d   fdd}|S )z[Returns hook that applies sparsification mask to input entering the attached layer
        r   r   c                    st   |d }d kr| S t dtD ]D}t| g |j}t| ||  }| || q&|S d S )Nr   )	r+   r,   r*   r-   r.   r/   r0   r1   Zindex_copy_)r2   r3   r4   r5   ZfeatureZ
sparsifiedr   r   r#   r   r   r9   !  s    z1ActivationSparsifier._sparsify_hook.<locals>.hook)rL   r   r:   r   rR   r   _sparsify_hook  s
    
z#ActivationSparsifier._sparsify_hookTc                 K   s`   | j  D ]P\}}|d   |d d| j | d< |rR|d | ||d< d|d< q
dS )z
        Unregisters aggreagate hook that was applied earlier and registers sparsification hooks if
        attach_sparsify_hook = True.
        r9   NonerA   r<   sparsifyN)r   rH   rM   rN   rI   rS   )r   Zattach_sparsify_hookkwargsr8   rP   r   r   r   squash_mask/  s    
z ActivationSparsifier.squash_maskc                 C   s8   t  }| j D ]"\}}dd | D }|||< q|S )a  Exclude hook and layer from the config keys before serializing

        TODO: Might have to treat functions (reduce_fn, mask_fn etc) in a different manner while serializing.
              For time-being, functions are treated the same way as other attributes
        c                 S   s   i | ]\}}|d kr||qS ))r9   r<   r   )r&   r   valuer   r   r   
<dictcomp>E  s       zFActivationSparsifier._get_serializable_data_groups.<locals>.<dictcomp>)r   r   rH   )r   r   r8   configZ
new_configr   r   r   _get_serializable_data_groups=  s
    
z2ActivationSparsifier._get_serializable_data_groupsc                 C   s   t |}| D ]\}}|d dk	rt|d trtt|d D ]:}|rf|d |  |d |< qD|d |  |d |< qDq|r|d  |d< q|d  |d< q|S )zConverts the mask to sparse coo or dense depending on the `sparse_coo` argument.
        If `sparse_coo=True`, then the mask is stored as sparse coo else dense tensor
        r#   N)	rE   rF   rH   
isinstancer   r+   r,   Zto_sparse_cooZto_dense)r   Zstates_dict
sparse_cooZstatesr'   r   idxr   r   r   _convert_maskI  s    
z"ActivationSparsifier._convert_maskr    c                 C   s"   |   }| | j}||| jdS )a0  Returns the state of the sparsifier as a :class:`dict`.

        It contains:
        * state - contains name -> mask mapping.
        * data_groups - a dictionary containing all config information for each
            layer
        * defaults - the default config while creating the constructor
        r   r   r   r[   r_   r   r   r   r   r   r   r   r   
state_dict]  s    	zActivationSparsifier.state_dict)rc   r!   c                 C   s0   |d }|d |d  }}|  |||d dS )zThe load_state_dict() restores the state of the sparsifier based on the state_dict

        Args:
        * state_dict - the dictionary that to which the current sparsifier needs to be restored to
        r   r   r   r`   N)__set_state__)r   rc   r   r   r   r   r   r   load_state_dictn  s    z$ActivationSparsifier.load_state_dictc                 C   s"   |   }| | j}| j||dS )N)r   r   r   ra   rb   r   r   r   __get_state__y  s    z"ActivationSparsifier.__get_state__)r   r!   c                 C   s   | j |d dd|d< | j| | j D ]z\}}t| j|}|d k	sLtd|krr|d dkrr|| 	|}n$d|kr|d dkr|| 
|}||d< ||d< q,d S )	Nr   F)r]   rA   r@   rU   r<   r9   )r_   __dict__rG   r   rH   r   r   r   rI   r;   rS   )r   r   r8   rZ   r<   r9   r   r   r   rd     s    z"ActivationSparsifier.__set_state__c                 C   s   | j jd }| j D ]^\}}|d7 }|d7 }|d| d7 }t| D ](}|dkrXqJ|d| d||  d7 }qJq|d7 }|S )	Nz (
z	Data Group
z	    name: )r"   r9   r   r   r   z	    z: ))	__class____name__r   rH   sortedkeys)r   format_stringr8   rZ   r   r   r   r   __repr__  s    zActivationSparsifier.__repr__)NNNNN)NNNNN)NN)T)T)rk   
__module____qualname____doc__r   Moduler   staticmethodr   r;   rJ   strrL   rD   rQ   rO   rS   rW   r[   r_   r   r   rc   re   rf   rd   ro   r   r   r   r   r
      s4   I    
)      .



)typingr   r   r   r*   collectionsr   r   rE   Zsparsifier.utilsr   r	   rB   __all__r
   r   r   r   r   <module>   s   