"""Per-layer profilers."""
import copy
import time
from typing import Any, Generator, List, Union, Sequence

import torch
from torch import Tensor
import torch.nn as nn

from ..microbatch import Batch

__all__: List[str] = []


Device = Union[torch.device, int, str]

Tensors = Sequence[Tensor]
TensorOrTensors = Union[Tensor, Tensors]


def layerwise_sandbox(module: nn.Sequential, device: torch.device) -> Generator[nn.Module, None, None]:
    """Copies layers for ease to profile. It doesn't modify the given
    module.
    """
    for layer in module:
        layer_copy = copy.deepcopy(layer)
        layer_copy.to(device)
        layer_copy.train()
        yield layer_copy


def detach(batch: Batch) -> None:
    """Detaches from autograd graph."""
    for i, x in enumerate(batch):
        batch[i] = x.detach().requires_grad_(x.requires_grad)


def profile_times(module: nn.Sequential, sample: TensorOrTensors, timeout: float, device: torch.device) -> List[int]:
    """Profiles elapsed times per layer."""
    if any(p.grad is not None for p in module.parameters()):
        raise ValueError("some parameter already has gradient")

    _batch = Batch(sample)
    for i, x in enumerate(_batch):
        _batch[i] = x.detach().to(device).requires_grad_(x.requires_grad)

    time_bufs: List[List[float]] = [[] for _ in module]
    begun_at = time.time()

    while time.time() - begun_at < timeout:
        batch = _batch

        for i, layer in enumerate(layerwise_sandbox(module, device)):
            detach(batch)

            if device.type == "cuda":
                torch.cuda.synchronize(device)
            tick = time.time()

            # Forward pass through the sandboxed copy of the layer.
            batch = batch.call(layer)

            # Backward pass, so the timing includes gradient computation.
            backward_tensors = tuple(y for y in batch if y.requires_grad)
            if backward_tensors:
                torch.autograd.backward(backward_tensors, backward_tensors)

            if device.type == "cuda":
                torch.cuda.synchronize(device)
            tock = time.time()

            time_bufs[i].append(tock - tick)

    us = 1_000_000
    return [sum(int(t * us) for t in buf) for buf in time_bufs]


def profile_sizes(module: nn.Sequential, input: TensorOrTensors, chunks: int, param_scale: float, device: torch.device) -> List[int]:
    """Profiles CUDA memory usage per layer."""
    if device.type != "cuda":
        raise ValueError("size profiler supports only CUDA device")

    batch = Batch(input)
    sizes: List[int] = []

    latent_scale = batch[0].size(0) / chunks
    for i, x in enumerate(batch):
        batch[i] = x[:1].detach().to(device).requires_grad_(x.requires_grad)

    for layer in layerwise_sandbox(module, device):
        detach(batch)

        # Detect memory usage at forward.
        memory_before = torch.cuda.memory_allocated(device)
        batch = batch.call(layer)
        memory_after = torch.cuda.memory_allocated(device)
        latent_size = memory_after - memory_before

        # Analyze size of parameters.
        param_size = sum(p.storage().nbytes() for p in layer.parameters())

        # Combine sizes of parameters and activations with the normalizing scales.
        size = latent_size * latent_scale + param_size * param_scale
        sizes.append(int(size))

    return sizes