
    `i'                        d Z ddlZddlmZ ddlmZ ddlmZmZm	Z	 ddl
Z
ddlZ
ddlmZ ddlmZ ddlmZ dd	lmZmZmZmZmZmZ dd
lmZmZmZmZmZ ddlm Z  ddl!m"Z" de
j#        j$        de%e&         fdZ'de
j#        j(        de)e
j*        e
j#        j+        f         fdZ,de
j#        j(        de&de	e-         fdZ.de
j#        j(        de&de	e-         fdZ/de
j#        j(        de&fdZ0de
j#        j(        de1e	e-                  fdZ2de
j#        j(        dee         defdZ3 G d d          Z4 e"d e4                       	 	 d'd!ed"ef         d#ee         d$e5d%e5ded"ee         f         f
d&Z6dS )(a  
This module implements CUDA graphs support for TorchDynamo backends.

CUDA graphs allow for capturing and replaying GPU operations, which can significantly
reduce CPU overhead in GPU-accelerated PyTorch models. This module provides:

- CUDA graph creation and management for both forward and backward passes
- Input mutation detection and handling
- Device compatibility checking
- Stack trace management for debugging
- Integration with TorchInductor's cudagraph trees

The backend supports two main modes:
1. cudagraphs: Full CUDA graph support with both forward and backward pass optimization
2. cudagraphs_inner: Lower-level CUDA graph implementation used for benchmarking

Key components:
- CudagraphsBackend: Main backend class for CUDA graph integration
- Mutation detection utilities to ensure graph safety
- Device mapping and compatibility checks
- Stack trace collection for debugging
    N)defaultdict)Sequence)AnyCallableOptional)config)aot_autograd)	boxed_nop)BoxedDeviceIndex'check_multiple_devices_or_any_cpu_nodesformat_default_skip_messageget_mutation_stack_traceget_placeholder_info#log_cudagraph_skip_and_bump_counter)	BoxedBoolcount_tangents%get_first_incompatible_cudagraph_nodenum_fw_fixed_argumentsoutput_node)StorageWeakRef   )register_backendgreturnc           	      n   dt           t          t          f         dt          fd}t          t                    }d}t	                      }| j        D ]c}|j        dk    rvt           ||j                  t          j
                  rH|t           ||j                                                                                         |           |dz  }|j        dk    rt          |j        d          s|j        j        }t#          |j                  D ]\  }}|t'          |j                  k     r|j        |         }	n!|j        |j        vr:|j        |j                 }	d	}
|j        r|j        j        rd
}
|
r8||t           ||	j                                                                     z  }e|S )Nmetar   c                 *    d| v r| d         n| d         S )Nvalfake_result )r   s    u/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torch/_dynamo/backends/cudagraphs.pymeta_fkz%find_input_mutations.<locals>.meta_fk7   s    #tmmtE{{m1DD    r   placeholderr   call_function_schemaFT)dictstrr   r   setnodesop
isinstancer   torchTensorr   _typed_storageaddhasattrtargetr&   	enumerate	argumentslenargsnamekwargs
alias_infois_write)r   r"   inputs	input_idxmutated_inputsnschemaiargargumentmut_args              r!   find_input_mutationsrD   6   s   Ed38n E E E E E FIUUNW  4=  ''!&//5<88 X~ggafoo&D&D&F&FGGHLLYWWWNIIT_$$18Y// X%F#F$455  3s16{{?? vayHHxqx//  x1H> '~. '"&  #f&wwx}'='='L'L'N'NOO' N
 r#   gmc                     i }| j         j        D ]J}|j                            dd           }t	          |t
          j                  r|j        |vr
|||j        <   K|S )Nr   )graphr*   r   getr,   r-   r.   device)rE   device_node_mappingr>   ts       r!   get_device_node_mappingrL   ]   sh     >@X^ . .FJJud##a&& 	.18;N+N+N,-)r#   	aot_model	num_fixedc                     t          | j                  t          t          |                    z
  }|sd S t	          | j                  }t          ||          S N)rD   rG   r)   ranger   r   )rM   rN   mutation_indicesplaceholderss       r!   3check_for_mutation_ignore_cuda_graph_managed_tensorrT   h   sV     ,IO<<s5CSCS?T?TT t'	88L#L2BCCCr#   c                     t           j        st          | |          x}r|S t          t	          |                     x}r|S t          |           x}rt          d|j         d          S d S )Nzincompatible op ())r   (cudagraph_backend_support_input_mutationrT   r   rL   r   r   r7   )rM   rN   mut_skipskipnodes        r!   check_for_skipr[   s   s    : Jy
 
 
8 	 O6	**  t  4Y???t M*+Kty+K+K+KLLL4r#   c                 |    t          t          t          |                               }|j        dk    sJ |j        S )Ncuda)nextiterrL   typeindex)rE   rI   s     r!   get_device_indexrb      s;    $.r223344F;&    <r#   c                     t          |           }t          |j                  dk    sJ |j        d         }t          |d          sg S d |D             S )Nr   r   __iter__c                 h    g | ]/}t          |t          j        j        j                  r|j        nd 0S rP   )r,   r-   fxrZ   Nodestack_trace).0rA   s     r!   
<listcomp>z$get_stack_traces.<locals>.<listcomp>   sD        'sEHM,>??	IT  r#   )r   r5   r6   r1   )rE   outputr6   s      r!   get_stack_tracesrl      sk    __Fv{q    ;q>D4$$ 	    r#   dynamo_modeldynamo_inputsc           	         ddl m t          d          t          d           	 ddt          j        j        dt          t                   dt          dt          ffd	}dt          j        j        dt          t                   dt          ffd
}t          ||t          j        |d          t          j        j        j                  } ||           S )Nr   )cudagraphify_implTFrM   
aot_inputsis_inferencer   c                    t          | |          }t          t          
          t          |                    }t          | |          x}r(t	          j        	           t          d|            |S                     t          |                       ||t          |          j
        ddt          |           t          | j                  t          | j                  	  	        }d|_        |S )Nskipping cudagraphs due to Fdevice_indexis_backwardrr   stack_tracesrS   mutated_input_idxsT)r
   r   r5   r[   r   disabler   r)   rb   rQ   valuerl   r   rG   rD   _boxed_call)rM   rq   rr   interpfixedskip_msgoutboxed_device_indexrp   do_cudagraphsrn   s          r!   forward_cudagraphsz&cudagraphs.<locals>.forward_cudagraphs   s    
 9j11&s='9'93z??KK%i7778 	m,,,/8h88   M/	::;;;%LL+1))44-io>>3IODD

 

 

 
r#   c                 "    t           |          }s S t                     }t           |          x}rpt          d|            	j        }|d}t
          j        j                            |d          J dt          t                   dt          f fd}d|_        |S  
||t          |          t                     ddt                     t           j                  t#           j                  		  	        }d|_        |S )
Nrt   r   F)create_if_none_existsr;   r   c                 B                                       |           S rP   )set_to_running_backward)r;   rM   managers    r!   fnz3cudagraphs.<locals>.backward_cudagraphs.<locals>.fn   s%    //111 y(((r#   Tru   )r
   r   r[   r   r{   r-   	_inductorcudagraph_treesget_managerlistr   r|   rQ   rb   rl   r   rG   rD   )rM   rq   r}   r~   r   
device_idxr   r   r   r   rp   r   s   `       @r!   backward_cudagraphsz'cudagraphs.<locals>.backward_cudagraphs   sZ    9j11 	y))%i7778 	/8h88  
 ,1J!
o5AA% B  G &&&)49 ) ) ) ) ) ) ) ) "BNI%LL))44))44-io>>3IODD

 

 

 
r#   )rr   )fw_compilerbw_compilerinference_compilerkeep_inference_input_mutations)F)torch._inductor.cudagraph_treesrp   r   r   r-   rf   GraphModuler   r   boolr	   	functoolspartial_dynamor   %cudagraph_backend_keep_input_mutation)rm   rn   r   r   aot_cudagraphsr   rp   r   s    `   @@@r!   
cudagraphsr      s.   AAAAAAdOOM)$//
 # 8'I  
	        :)8')59#Y)	) ) ) ) ) ) ) )V "&'$,-?dSSS',}';'a	  N >,666r#   c                   n    e Zd ZdZedd            Zedej        j        de	e
         de
fd            ZdS )	CudagraphsBackendr   r   Nc                  &    ddl m}   |              d S )Nr   reset_cudagraph_trees)r   r   r   s    r!   resetzCudagraphsBackend.reset   s)    IIIIIIr#   modelr;   c                 "    t          | |          S rP   )r   )r   r;   s     r!   __call__zCudagraphsBackend.__call__   s    %(((r#   )r   N)__name__
__module____qualname__compiler_namestaticmethodr   r-   rf   r   r   r   r   r    r#   r!   r   r      su         M      \ 
 ), )hsm ) ) ) ) \) ) )r#   r   r   )r7   compiler_fnTr   .r;   copy_outputscopy_inputsc                    t          |t          t          f          sJ rd |D             nt          |          t          j                                         t          j                                        }|                    t          j                                                   t          j        	                    |          5   | |  ddd           n# 1 swxY w Y   |                                 t          j                                                            |           t          j                                         t          j        
                                t          j                            |          5   |  ddd           n# 1 swxY w Y   t          t          t          f          sfdt          dt          t                   ffd}|S )zBThis isn't registered as a backend, but is used in some benchmarksc                 6    g | ]}t          j        |          S r    )r-   
zeros_likeri   xs     r!   rj   z$cudagraphs_inner.<locals>.<listcomp>
  s#    ===)!,,===r#   N)stream
new_inputsr   c                      t                    t          |           k    sJ r+t          |           D ]\  }}|                    |                                            rd D             S S )Nc                 6    g | ]}|                                 S r    )cloner   s     r!   rj   z1cudagraphs_inner.<locals>.run.<locals>.<listcomp>&  s     666!AGGII666r#   )r5   zipcopy_replay)r   dstsrcr   r   rG   static_inputsstatic_outputss      r!   runzcudagraphs_inner.<locals>.run  s    =!!S__4444 	z::  S		# 	"66~6666!!r#   )r,   r   tupler-   r]   synchronizeStreamwait_streamcurrent_streamr   	CUDAGraphrG   r   r   )	r   r;   r   r   r   r   rG   r   r   s	     ``  @@@r!   cudagraphs_innerr     sN    ftUm,,,,, %==f===V 
JZ  F
uz0022333			6	"	"  v              
	J++F333	J J  ""E			%		/	/ / /./ / / / / / / / / / / / / / /ntUm44 +(*	" 	"# 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" Js$   C  C$'C$FF#&F#)TT)7__doc__r   collectionsr   collections.abcr   typingr   r   r   r-   torch.fxtorch._dynamor   torch._dynamo.backends.commonr	    torch._dynamo.backends.debuggingr
   torch._inductor.cudagraph_utilsr   r   r   r   r   r   torch._inductor.utilsr   r   r   r   r    torch.multiprocessing.reductionsr   registryr   rf   Graphr)   intrD   r   r'   rI   rg   rL   r(   rT   r[   rb   r   rl   r   r   r   r   r    r#   r!   <module>r      s   .     # # # # # # $ $ $ $ $ $ * * * * * * * * * *               6 6 6 6 6 6 6 6 6 6 6 6                             < ; ; ; ; ; & & & & & &$EHN $s3x $ $ $ $N	%,
%&   Dx#D03Dc]D D D Deh2 s xPS}    $- #    	- 	$x}2E 	 	 	 	T7UX1 T7(3- T7TW T7 T7 T7 T7n) ) ) ) ) ) ) )   l0A0A0C0C D D D D 	) )CH)SM) ) 	)
 c8C= !) ) ) ) ) )r#   