U
    Kc`4                     @   s  d dl mZ d dlmZ d dlmZ d dlmZ d dlm	Z	 d dl
Z
d dlZ
d dlmZmZmZmZ d dlmZ d d	lmZ d d
lmZmZmZ e
j rd dlmZmZmZ ndZedddZeddG dd dZ eddG dd dZ!dd Z"dd Z#eddedddZ$ddedddZ%G dd  d e
j&j'j(j)Z*G d!d" d"e
j&j+Z,G d#d$ d$e
j-j.Z/eddee0e0d%d&d'Z1ddedd(d)Z2dS )*    )deepcopy)	dataclass)	lru_cache)MappingProxyType)warnN)!_torch_dtype_to_nvfuser_dtype_mapgetnvFuserDtypeNumbernumber_type)GraphModule)CapabilityBasedPartitioner)tree_flattentree_maptree_unflatten)DataTypeFusionFusionDefinitionT)use_python_fusion_cacheallow_single_op_fusion)frozenc                   @   s.   e Zd ZU eed< eed< eed< eed< dS )nvFuserTensorTemplatesizestridedtypeis_cpuN)__name__
__module____qualname__tuple__annotations__r   bool r!   r!   A/tmp/pip-unpacked-wheel-gikjz4vx/torch/_prims/nvfuser_executor.pyr   (   s   
r   c                   @   s   e Zd ZU eed< dS )nvFuserScalarTemplater   N)r   r   r   r   r   r!   r!   r!   r"   r#   0   s   
r#   c                 C   s   dd }t || S )Nc                 S   sL   t | tjr*t|  |  t| j| jS t | t	rDt
tt| S | S d S N)
isinstancetorchTensorr   r   r   r   r   r   r	   r#   r
   argr!   r!   r"   
to_nvfuser6   s    
z,to_nvfuser_template_args.<locals>.to_nvfuser)r   )argsr*   r!   r!   r"   to_nvfuser_template_args5   s    r,   c                 C   s   t tdd | S )Nc                 S   s   t dd | jD S )Nc                 s   s&   | ]}t |tjjr|jd kV  qdS )Zget_attrN)r%   r&   fxNodeop).0ar!   r!   r"   	<genexpr>J   s     z7_any_get_attr_used.<locals>.<lambda>.<locals>.<genexpr>)anyr+   nr!   r!   r"   <lambda>J   s   z$_any_get_attr_used.<locals>.<lambda>)r3   filter)call_function_nodesr!   r!   r"   _any_get_attr_usedF   s    r9   i   )maxsize)gmc              	      sx  t j std| jjD ]N}|jdkr4d|jkr4q|jdkrt|j	dd d krt
d| d|j	 dqttdd	 | jj}ttd
d	 | jj}t|t|kstdt|dkstdt|dkstdt|rtdt }t|vfdd G  fdddt jj}fdd}tt||}|| j| }	t|	\}
}|
D ]}| qTW 5 Q R X ||fS )NzCAttempting to use nvFuser trace executor but CUDA is not available!call_functiongetitemimpl_nvfuserz@All call_function nodes in the graph must support nvfuser. Node z with target z does not support nvfuserc                 S   s
   | j dkS Nplaceholderr/   r4   r!   r!   r"   r6   g       z%make_nvfuser_fusion.<locals>.<lambda>c                 S   s
   | j dkS Nr<   rA   r4   r!   r!   r"   r6   i   rB   zBNumber of placeholder nodes in the graph must match number of argsr   z#There must be at least one argumentz2Graph must contain at least one call_function nodezXConstant tensors that are saved in the graph and used as arguments are not supported yetc                    s   t | tr | S | S d S r$   )r%   r	   Zdefine_constantr(   fdr!   r"   _to_nvfuser_constanty   s    

z1make_nvfuser_fusion.<locals>._to_nvfuser_constantc                       s.   e Zd Z fddZfddZ  ZS )z.make_nvfuser_fusion.<locals>.FusionInterpreterc                    s  |j tjjjtjjjjfkrvt|jd jd j	}t
|jdksDt| |\}}|d ||d g}| |j ||jS |j tjjjtjjjjfkr| |\}}t
|dkst|d }tt|dd  }|d d |f | }|j jf||S t |S )Nr   tensor_meta               )targetr&   opsnvprimsZsqueezedefaultlistr+   metashapelenAssertionErrorZfetch_args_kwargs_from_envr<   kwargsZnative_batch_normr   mapr>   superrun_node)selfnodeZoriginal_shaper+   rV   ZtrainingZ	args6_end)	__class__rF   rE   r!   r"   rY      s(    

z7make_nvfuser_fusion.<locals>.FusionInterpreter.run_nodec                    sP   dt |kr(t|d tst|||S tt |}|j}f| }|||S )Nr=   r   )strr%   r   rU   rW   r>   )rZ   rM   r+   rV   rF   rE   r!   r"   r<      s    

z<make_nvfuser_fusion.<locals>.FusionInterpreter.call_function)r   r   r   rY   r<   __classcell__r!   r^   r\   r"   FusionInterpreter   s   ra   c                    sH   t | tr& | j| j| j| j}|S t | tr@ | j}|S | S d S r$   )	r%   r   Zdefine_tensorr   r   r   r   r#   Zdefine_scalar)r)   xrD   r!   r"   templates_to_nvfuser_inputs   s    

z8make_nvfuser_fusion.<locals>.templates_to_nvfuser_inputs)r&   cudais_availableRuntimeErrorgraphnodesr/   namegetattrrM   
ValueErrorrQ   r7   rT   rU   r9   r   r   r-   Interpreterr   rW   runr   Z
add_output)r;   Znv_args_templatesr[   Zgraph_input_nodesr8   fusionra   rc   Znv_argsoutZflat_outunflatten_specor!   r^   r"   make_nvfuser_fusionS   sX    


$rr   )executor_parametersc          
      G   s   |pt }t|\}}tdd |D rtdd |D rt|}|dt d }|rht| f| \}}ntj| f| \}}tdd |D }	t	|
|	|S td | j| S d S )Nc                 s   s    | ]}t |tjo|jV  qd S r$   )r%   r&   r'   is_cudar0   r)   r!   r!   r"   r2      s     z"nvfuser_execute.<locals>.<genexpr>c                 s   s2   | ]*}t |tj p(|jr$|jd kp(|jV  qdS )r   N)r%   r&   r'   r   ndimrt   ru   r!   r!   r"   r2      s
   r   c                 s   s"   | ]}t |tjtfr|V  qd S r$   )r%   r&   r'   r	   ru   r!   r!   r"   r2      s     zJnvfuser_executor is executed with non-cuda args, fallback to aten executor)DEFAULT_NVFUSER_PYTHON_CONFIGr   r3   allr,   getrr   __wrapped__r   r   executer   Zforward)
r;   rs   r+   Z	flat_args_Znv_template_argsZ	use_cachern   rp   Zconcrete_fusion_inputsr!   r!   r"   nvfuser_execute   s0    r}   c                   @   s    e Zd ZejjedddZdS )NvfuserPrimOperatorSupport)r[   returnc                 C   sr   |j dkrL|jtjjjjkrLt|j	d d k	oJt|j	d j
d jd k	S |j dkrht|jdd d k	ppd|jkS )Nr<   rI   r   rG   r>   r=   )r/   rM   r&   rN   rO   Zconvert_element_typerP   r   ry   r+   rR   r   rj   ri   )rZ   Z
submodulesr[   r!   r!   r"   is_node_supported   s     
z,NvfuserPrimOperatorSupport.is_node_supportedN)r   r   r   r&   r-   r.   r    r   r!   r!   r!   r"   r~      s   r~   c                       s   e Zd Z fddZ  ZS )PartitionedInterpreterc                    sT   t |tstt|dkst| |}|dr@t|f| S t |||S d S )Nr   fused_)	r%   r]   rU   rT   Z
fetch_attr
startswithr}   rX   call_module)rZ   rM   r+   rV   Zsubmodr`   r!   r"   r      s    

z"PartitionedInterpreter.call_module)r   r   r   r   r_   r!   r!   r`   r"   r      s   r   c                       s$   e Zd Z fddZdd Z  ZS )NvfuserGraphModulec                    s   t    || _d|i| _d S )Nr   )rX   __init__r;   rs   )rZ   r;   r   r`   r!   r"   r     s    
zNvfuserGraphModule.__init__c                 G   s   t | jf|d| jiS )Nrs   )r}   r;   rs   )rZ   r+   r!   r!   r"   __call__	  s    zNvfuserGraphModule.__call__)r   r   r   r   r   r_   r!   r!   r`   r"   r     s   r   )r;   r   r   c           
         s  t   ttdd | jj}t fdd|D }|t|dkO }t|shtttdd | jjdkrp| dfS |rt| } t	|  |d}|
 }t|dkrtd	td
 ||}|jjD ]D}|jdkrd|jkrt||j}	||j | |jt|	| q||fS | |fS d S )Nc                 S   s
   | j dkS rC   rA   r4   r!   r!   r"   r6     rB   z'maybe_partition_graph.<locals>.<lambda>c                 3   s   | ]}  d | V  qd S r$   )r   )r0   r[   Zsupported_opsr!   r"   r2     s    z(maybe_partition_graph.<locals>.<genexpr>r   c                 S   s
   | j dkS r?   rA   r4   r!   r!   r"   r6   $  rB   T)Zallows_single_node_partitionzNo partition found for the graph. This is likely because the graph is not supported by nvFuser. Please use the eager ATen mode to execute the graph.)categoryr   r   )r~   rQ   r7   rg   rh   r3   rT   r9   r   r   Zpropose_partitionsr   RuntimeWarningZfuse_partitionsr/   ri   rj   Zdelete_submodulerM   Zadd_submoduler   )
r;   r   r   r8   Zany_unsupportedZpartitionerZ
partitionsZpartitioned_graphr[   Znvfuser_submoduler!   r   r"   maybe_partition_graph  sJ      
r   c                G   s^   |pt }|dt d }|dt d }t| ||d\} }|rF| | S t| f|d|iS d S )Nr   r   )r   r   rs   )rw   ry   r   r}   )r;   rs   r+   r   r   Zis_partitionedr!   r!   r"   nvfuser_execute_partitionedJ  s"    
r   )3copyr   Zdataclassesr   	functoolsr   typesr   warningsr   r&   Ztorch.overridesZtorch._prims_commonr   r   r	   r
   Ztorch.fxr   Z!torch.fx.passes.infra.partitionerr   Ztorch.utils._pytreer   r   r   rd   re   Ztorch._C._nvfuserr   r   r   rw   r   r#   r,   r9   rr   r}   r-   ZpassesZoperator_supportZOperatorSupportr~   rl   r   nnModuler   r    r   r   r!   r!   r!   r"   <module>   sH   
c+  9