import copy
import logging
import os
import pickle
import random
from contextlib import contextmanager
from functools import partial
from typing import Callable, Optional, Tuple, Union

import torch
import torch.fx as fx
import torch.nn as nn
from torch._decomp import get_decompositions

from .aot_autograd import aot_function, aot_module, make_boxed_compiler
from .compile_utils import strip_overloads
from .partitioners import (
    default_partition,
    draw_graph,
    min_cut_rematerialization_partition,
)


def _canonicalize(fx_g):
    # aten._to_copy does not script cleanly; rewrite it to aten.to so the
    # graph can be handled by TorchScript below.
    for node in fx_g.graph.nodes:
        if node.target == torch.ops.aten._to_copy:
            node.target = torch.ops.aten.to
    fx_g.recompile()
    return fx_g


@contextmanager
def _disable_jit_autocast():
    # Temporarily turn off JIT autocast so scripting/freezing is not affected
    # by autocast state; restore the previous flag on exit.
    old_jit_autocast_flag = torch._C._jit_set_autocast_mode(False)
    try:
        yield
    finally:
        torch._C._jit_set_autocast_mode(old_jit_autocast_flag)


@make_boxed_compiler
def ts_compile(fx_g: fx.GraphModule, inps) -> Callable:
    """
    Compiles the :attr:`fx_g` with the TorchScript compiler.

    .. warning::
        This API is experimental and likely to change.

    Args:
        fx_g (fx.GraphModule): The input FX graph module to be compiled.

    Returns:
        Torch scripted model.
    """
    with _disable_jit_autocast():
        strip_overloads(fx_g)

        # Rewrite dtype-only _to_copy calls to aten.to for scriptability.
        for node in fx_g.graph.nodes:
            if (
                node.target == torch.ops.aten._to_copy
                and len(node.args) == 1
                and len(node.kwargs) == 1
                and "dtype" in node.kwargs
            ):
                node.target = torch.ops.aten.to

        # TorchScript cannot take torch.device objects as kwargs; pass the
        # device type string instead.
        for node in fx_g.graph.nodes:
            new_kwargs = {}
            for k, v in node.kwargs.items():
                if isinstance(v, torch.device):
                    v = v.type
                new_kwargs[k] = v
            node.kwargs = new_kwargs

        fx_g.graph.lint()
        fx_g.recompile()

        f = torch.jit.script(fx_g)
        torch._C._jit_pass_remove_mutation(f.graph)
        f = torch.jit.freeze(f.eval())
        f = torch.jit.optimize_for_inference(f)
        # Run once so the scripted module is specialized before it is returned.
        f(*inps)
    return f
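

# Usage sketch (illustrative, not part of the original module): because
# ts_compile is a boxed compiler, it is normally handed to aot_function /
# aot_module as the fw/bw compiler rather than called directly. The function
# `fn` below is a made-up example.
#
#     from functorch.compile import aot_function, ts_compile
#
#     def fn(x):
#         return torch.sin(x).sum()
#
#     compiled_fn = aot_function(fn, fw_compiler=ts_compile, bw_compiler=ts_compile)
#     out = compiled_fn(torch.randn(4, requires_grad=True))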
def _draw_graph_compile(fx_g, _, name, clear_meta=True):
    print(fx_g.code)
    draw_graph(fx_g, name, clear_meta=clear_meta)
    return fx_g


def draw_graph_compile(name):
    return make_boxed_compiler(partial(_draw_graph_compile, name=name))


@make_boxed_compiler
def nop(fx_g: fx.GraphModule, _) -> Callable:
    """
    Returns the :attr:`fx_g` Fx graph module as it is. This is a no-op compiler
    and can be used to check accuracy.

    .. warning::
        This API is experimental and likely to change.

    """
    return fx_g


@make_boxed_compiler
def simple_ts_compile(fx_g, _):
    strip_overloads(fx_g)
    f = torch.jit.script(fx_g)
    f = torch.jit.freeze(f.eval())
    return f


def nnc_jit(f, static_argnums=None):
    return aot_function(f, simple_ts_compile, static_argnums=static_argnums)


aten = torch.ops.aten
default_decompositions = {
    aten.detach,
    aten.gelu_backward,
    aten.leaky_relu_backward,
    aten.sigmoid_backward,
    aten.threshold_backward,
    aten.hardtanh_backward,
    aten.hardsigmoid_backward,
    aten.hardswish_backward,
    aten.tanh_backward,
    aten.silu_backward,
    aten.elu_backward,
    aten.cudnn_batch_norm,
    aten.cudnn_batch_norm_backward,
    aten.masked_fill.Scalar,
    aten.masked_fill.Tensor,
    aten.elu,
    aten.leaky_relu,
    aten.hardtanh,
    aten.hardswish,
    aten.hardsigmoid,
    aten.conj_physical,
    aten.is_same_size,
}
default_decompositions = get_decompositions(default_decompositions)


@make_boxed_compiler
def print_compile(fx_g, _):
    print(fx_g.code)
    return fx_g
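

# Usage sketch (illustrative, not part of the original module): nnc_jit gives a
# TorchScript-backed JIT for simple numeric functions, while nop is useful as a
# baseline compiler to check that AOT tracing itself preserves accuracy. The
# function `f` below is a made-up example.
#
#     def f(x):
#         return torch.sin(x) + x * 2
#
#     fast_f = nnc_jit(f)
#     traced_f = aot_function(f, fw_compiler=nop)
#     x = torch.randn(3)
#     torch.testing.assert_close(fast_f(x), f(x))
#     torch.testing.assert_close(traced_f(x), f(x))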
def memory_efficient_fusion(
    fn: Union[Callable, nn.Module],
    static_argnums: Optional[Tuple[int]] = None,
    **kwargs,
):
    """
    Wrapper function over :func:`aot_function` and :func:`aot_module` to perform
    memory efficient fusion. It uses the
    :func:`min_cut_rematerialization_partition` partitioner to perform efficient
    recomputation. It uses NVFuser to compile the generated forward and backward
    graphs.

    .. warning::
        This API is experimental and likely to change.

    Args:
        fn (Union[Callable, nn.Module]): A Python function or a ``nn.Module``
            that takes one or more arguments. Must return one or more Tensors.
        static_argnums (Optional[Tuple[int]]): An optional tuple of ints to mark
            the arguments of the function as static.
        **kwargs: Any other overrides you want to make to the settings

    Returns:
        Returns a ``Callable`` or ``nn.Module`` that retains the eager behavior
        of the original :attr:`fn`, but whose forward and backward graphs have
        gone through recomputation optimizations, and the graphs have been
        compiled with nvfuser.

    """
    config = {
        "fw_compiler": ts_compile,
        "bw_compiler": ts_compile,
        "partition_fn": min_cut_rematerialization_partition,
        "decompositions": default_decompositions,
        "static_argnums": static_argnums,
    }
    config.update(kwargs)
    if isinstance(fn, torch.nn.Module):
        return aot_module(fn, **config)
    else:
        return aot_function(fn, **config)
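

# Usage sketch (illustrative, not part of the original module): wrap an
# nn.Module (or a plain function) and use the result exactly like the
# original. `MyModel` is a made-up stand-in.
#
#     model = MyModel().cuda()
#     fused_model = memory_efficient_fusion(model)
#     loss = fused_model(inputs).sum()
#     loss.backward()  # the backward graph was compiled as well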
def debug_compile(fx_g, inps):
    fx_g.to_folder("foo")
    print(
        f"""
##############################################################
# To minimize FX graph, copy and paste the below and run it  #
##############################################################

import torch
import torch.fx as fx
from functorch.compile import minifier, check_nvfuser_subprocess, check_nvfuser_correctness_subprocess

inps = {[(i.shape, i.dtype) for i in inps]}
inps = [torch.ones(shape, dtype=dtype, device='cuda') for (shape, dtype) in inps]
from foo import FxModule
mod = FxModule().cuda()

with torch.jit.fuser("fuser2"):
  # check_nvfuser_subprocess can be replaced with check_nvfuser_correctness_subprocess
  minifier(fx.symbolic_trace(mod), inps, check_nvfuser_subprocess)
"""
    )
    from foo import FxModule

    FxModule().cuda()(*inps)

    return ts_compile(fx_g, inps)


graph_index = 0


def get_inputs(input_data_path):
    """
    Return a random input for the given inputs meta generated from _save_fx_default.
    """
    inputs = []
    with open(input_data_path, "rb") as f:
        inputs_meta = pickle.load(f)
        for meta in inputs_meta:
            if len(meta) == 1:
                # Scalar input: meta is (type,); draw a random float and cast.
                (type,) = meta
                input = type(random.random())
            else:
                type, shape, stride, dtype, device = meta
                if dtype in {
                    torch.int,
                    torch.int32,
                    torch.int64,
                    torch.bool,
                    torch.uint8,
                    int,
                    float,
                }:
                    input = torch.randint(0, 1, shape, dtype=dtype, device=device)
                else:
                    input = torch.rand(shape, dtype=dtype, device=device)
            inputs.append(input)
    return inputs


def _save_fx_default(current_name, folder_name, dump_example_input, gm, example_inputs):
    """
    The forward, backward, and joint computation graph will be stored in
    {folder_name}/{current_name}/{current_name}_forward_{graph_index},
    {folder_name}/{current_name}/{current_name}_backward_{graph_index}, and
    {folder_name}/{current_name}/{current_name}_joint_{graph_index} respectively.
    The input shape of the graphs will be stored in the .input files.
    These files can be loaded with pickle,
    and each is a list of format (type, shape, stride, dtype, device).
    In the case of type = int or float, it is just (type,).
    For joint graph input, it is a nested list [[],[]]
    where the two inner lists have the same format.
    If dump_example_input is True, example_inputs will be stored in .pt file.
    Since each function might produce multiple graphs,
    the graph_index is used to distinguish different graphs.
    """
    from functorch.compile import aot_module_simplified

    def get_input_meta(args):
        input_meta = []
        if len(args) > 0 and isinstance(args[0], tuple):  # joint input
            input_meta += get_input_meta(args[0])
            input_meta += get_input_meta(args[1])
            return input_meta
        for arg in args:
            if type(arg) == int or type(arg) == float:
                input_meta.append((type(arg),))
            else:
                input_meta.append(
                    (type(arg), arg.shape, arg.stride(), arg.dtype, arg.device)
                )
        return input_meta

    def graph_saver_helper(gm_to_save, args, type_name):
        global graph_index
        if len(gm_to_save.graph.nodes) == 0:
            logging.log(
                logging.WARNING,
                f"No nodes in graph {current_name}_{type_name}_{graph_index}.",
            )
            return

        gm = copy.deepcopy(gm_to_save)
        gm.graph.set_codegen(torch.fx.graph.CodeGen())  # remove codegen
        gm.recompile()

        input_meta = get_input_meta(args)

        if not os.path.exists(f"{folder_name}/{current_name}"):
            os.makedirs(f"{folder_name}/{current_name}")
        graph_name = f"{current_name}_{type_name}_{graph_index}"
        graph_folder = f"{folder_name}/{current_name}/{graph_name}"
        gm.to_folder(graph_folder)
        pickle.dump(input_meta, open(f"{graph_folder}/{graph_name}.input", "wb"))
        if dump_example_input:
            torch.save(args, f"{graph_folder}/{graph_name}.pt")

    def graph_saver_forward(gm, fw_args):
        graph_saver_helper(gm, fw_args, "forward")
        return gm

    def graph_saver_backward(gm, bw_args):
        graph_saver_helper(gm, bw_args, "backward")
        global graph_index
        graph_index += 1
        return gm

    def graph_saver_joint(gm, joint_args):
        graph_saver_helper(gm, joint_args, "joint")
        return default_partition(gm, joint_args)

    return aot_module_simplified(
        gm,
        fw_compiler=graph_saver_forward,
        bw_compiler=graph_saver_backward,
        partition_fn=graph_saver_joint,
        decompositions=default_decompositions,
    )


def graph_dumper_aot(current_name, folder_name, dump_example_input=False):
    """
    Dump the forward, backward, and joint computation graph.
    Example Usage:
    save_fx_func = graph_dumper_aot(current_name, folder_name, dump_example_input = False)
    optimize_ctx = torchdynamo.optimize(
        save_fx_func
    )
    with torch.enable_grad():
        with optimize_ctx:
            result = forward_and_backward_pass(model, example_inputs)
    """
    global graph_index
    graph_index = 0
    return partial(_save_fx_default, current_name, folder_name, dump_example_input)
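

# Workflow sketch (illustrative, not part of the original module): dump every
# graph seen during a training step, then rebuild random inputs for one of the
# dumped graphs with get_inputs. `train_step`, `model`, and the dump path are
# made-up names; the .input path follows the layout described in
# _save_fx_default's docstring.
#
#     import torchdynamo
#
#     save_fx_func = graph_dumper_aot("resnet", "/tmp/graphs", dump_example_input=False)
#     with torch.enable_grad():
#         with torchdynamo.optimize(save_fx_func):
#             train_step(model, example_inputs)
#
#     inps = get_inputs("/tmp/graphs/resnet/resnet_forward_0/resnet_forward_0.input")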