
    `i                    B   U d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Zd dl
mZ d dlmZmZ d dlmZmZmZmZmZ d dlZd dlZd dlZd dlmZ d dlmc mZ d dlmZm Z  d dl!m"Z" d dl#m$Z% d dl&m'Z' d d	l(m)Z) d d
l*m+Z+ d dl,m-Z-m.Z. d dl/m0Z0m1Z1 d dl2m3Z3m4Z4m5Z5m6Z6m7Z7m8Z8 d dl9m:Z: d dl;m<Z< d dl=m>Z> ddl?m$Z$ ddl@mAZA ddlBmCZCmDZDmEZE ddlFmGZG ddlHmIZImJZJ ddlKmLZL ddlMmNZNmOZO ddlPmQZQmRZRmSZS erd dlTZTe$jU        ZVeWeXd<    ejY        eZ          Z[ej\        eXd<   ej]        j^        Z^ej]        j_        Z_e G d d                      Z`e G d d                      Zae G d d                       Zbd!ejc        d"eWfd#Zdd$eje        d"eWfd%Zfd$eje        d"eWfd&Zgd!ejc        d"ehfd'Zi G d( d)          Zj ej            Zk	 dd*ejl        d+emejc                 d,emejc                 d-emeI         d.een         d"ejl        fd/Zod!ejc        d"eWfd0Zpd!ejc        d"eWfd1Zqd!ejc        d"eWfd2Zrd!ejc        d"eWfd3Zsd!ejc        d"eWfd4Ztd!ejc        d"eWfd5Zud!ejc        d"eWfd6Zvd!ejc        d"eWfd7Zwd!ejc        d"eWfd8Zxd!ejc        d"eWfd9Zyd:eje        d"ezemejc                 emejc                 emeI         emeI         f         fd;Z{d<emejc                 d=enfd>Z|d?eemejc                 ezejc                 f         d"ehfd@Z}	 	 ddCej        jl        d!ej        jc        dDe~dEe~fdFZdCej        jl        d!ej        jc        dGej        jc        dHej        dIe~dJe~d"ej        jc        fdKZdLej        d"e~fdMZd"emej                 fdNZd!ej        jc        d"eWfdOZd"ej        fdPZdQej        d"ezfdRZdCej        jl        d"dfdSZdCej        jl        d"dfdTZdUeje        dVeje        dWeenejc        f         d"dfdXZ	 dd<emejc                 dUeje        dVeje        dYee<ejc                          d"df
dZZdd[d:eje        d<emejc                 d\emejc                 d]ehdYee<ejc                          d"ezeje        eje        f         fd^Zddd_d:eje        d`eemeh                  dYee<ejc                          d"ezeje        eje        f         fdaZ ehdb          Zdcehd"ehfddZd!ejc        d"ehfdeZdCejl        fdfZej        dg             Zdheejc        ehf         d"emezejc        ehf                  fdiZdjeje        d"eje        fdkZdlej        je        dmej        je        dnej        jc        doej        jc        dpej        dqehdrej        jc        dsej        jc        fdtZd:eje        dleje        dmeje        duehd"ezeje        eje        f         f
dvZd:eje        d"dfdwZd:eje        d"dfdxZd:eje        d"eje        fdyZ	 dd*ejl        dzead{ebd|ee<ejc                          fd}Zd~ Zd"e`fdZdCejl        fdZd*ejl        deme~         deme~         de~dzeademejc                 d"eze~emeh         emeh         f         fdZd dlmZ dej        dehd"ej        fdZd Z	 dd*ejl        dzead"emejc                 fdZd*ej        jl        d<emej        jc                 fdZd Z	 dddd:eje        d`eemeh                  d"ezeje        eje        f         fdZ	 	 	 	 	 ddej        je        dendendeWdeeenemen         f                  deWdeen         d"dfdZdS )    Ndefaultdict)	dataclassreplace)AnyCallableOptionalTYPE_CHECKINGUnion)countersis_node_meta_valid)(create_structured_trace_for_min_cut_info)config)trace_structured)extract_tensor_metadata)BackwardState)is_sym_nodepy_sym_types)magic_methodsmethod_to_operator)find_symbol_binding_fx_nodesfree_symbolshint_intis_symbol_binding_fx_nodestatically_known_falsestatically_known_true)graph_drawer)
OrderedSet)CheckpointPolicy   )GraphInfoProvider)dp_knapsackgreedy_knapsackilp_knapsack)KnapsackEvaluator)	AOTOutputSavedForBackwardsAOTOutput)get_aot_graph_name)get_cuda_generator_meta_valis_with_effects)fx_graph_cseget_aten_targetraise_getitemsAOT_PARTITIONER_DEBUGlogc                       e Zd ZU dZee         ed<   ee         ed<   ee         ed<   ee         ed<   ee         ed<   dej        fdZ	dej        fd	Z
dej        fd
Zdej        fdZdej        fdZdS )OpTypesz8Class for keeping track of different operator categoriesfusible_opscompute_intensive_ops
random_opsview_opsrecomputable_opsnodec                 .    t          |          | j        v S N)r,   r2   selfr7   s     q/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torch/_functorch/partitioners.py
is_fusiblezOpTypes.is_fusibleN   s    t$$(888    c                 .    t          |          | j        v S r9   )r,   r3   r:   s     r<   is_compute_intensivezOpTypes.is_compute_intensiveQ   s    t$$(BBBr>   c                 .    t          |          | j        v S r9   )r,   r4   r:   s     r<   	is_randomzOpTypes.is_randomT   s    t$$77r>   c                 .    t          |          | j        v S r9   )r,   r5   r:   s     r<   is_viewzOpTypes.is_viewW   s    t$$55r>   c                 .    t          |          | j        v S r9   )r,   r6   r:   s     r<   is_recomputablezOpTypes.is_recomputableZ   s    t$$(===r>   N)__name__
__module____qualname____doc__r   r   __annotations__fxNoder=   r@   rB   rD   rF    r>   r<   r1   r1   D   s        BBH%%%%%h////8$$$$"""" ****9rw 9 9 9 9C C C C C8bg 8 8 8 86BG 6 6 6 6>BG > > > > > >r>   r1   c                   x   e Zd ZU eej                 ed<   eej                 ed<   eej                 ed<   eej                 ed<   eej        e	f         ed<   eej                 ed<   e
j        deej                 fd            Zd	ej        defd
Zd	ej        defdZd	ej        defdZd	ej        de	fdZdS )NodeInfoinputs_required_fw_nodesrequired_bw_nodesunclaimed_nodesfw_orderstatic_lifetime_input_nodesreturnc                 J     t          d  j        D              fd          S )Nc              3      K   | ]}|V  d S r9   rN   .0ns     r<   	<genexpr>z-NodeInfo.required_fw_nodes.<locals>.<genexpr>m   s"      001Q000000r>   c                     j         |          S r9   )rU   )r\   r;   s    r<   <lambda>z,NodeInfo.required_fw_nodes.<locals>.<lambda>m   s    a@P r>   key)sortedrR   r;   s   `r<   required_fw_nodeszNodeInfo.required_fw_nodesj   s:    00/0006P6P6P6P
 
 
 	
r>   r\   c                     || j         v S r9   )rR   r;   r\   s     r<   is_required_fwzNodeInfo.is_required_fwp   s    D+++r>   c                     || j         v S r9   )rS   rf   s     r<   is_required_bwzNodeInfo.is_required_bws   s    D***r>   c                     || j         v S r9   )rT   rf   s     r<   is_unclaimedzNodeInfo.is_unclaimedv   s    D(((r>   c                 J    || j         v sJ d| d            | j        |         S )NNode z not in fw nodes!)rR   rU   rf   s     r<   get_fw_orderzNodeInfo.get_fw_ordery   s7    D++++-IQ-I-I-I+++}Qr>   N)rG   rH   rI   listrL   rM   rK   r   dictint	functoolscached_propertyrd   boolrg   ri   rk   rn   rN   r>   r<   rP   rP   ^   sL         M"27++++!"'****((((27C<    !+BG!4444
4= 
 
 
 

, ,D , , , ,+ +D + + + +)bg )$ ) ) ) ) bg  #            r>   rP   c                   B    e Zd ZU eed<   eed<   eed<   eed<   eed<   dS )MinCutOptionsban_if_used_far_apartban_if_long_fusible_chainsban_if_materialized_backwardban_if_not_in_allowlistban_if_reductionN)rG   rH   rI   rt   rK   rN   r>   r<   rv   rv   ~   sN          $$$$"&&&&!!!!r>   rv   r7   rW   c                 h    | j                             dd           t          j        t          j        fv S )N	recompute)metagetr   MUST_RECOMPUTEPREFER_RECOMPUTEr7   s    r<   must_recomputer      s0    9==d++')0  r>   fx_gc                 H    | j         j        D ]}t          |          r dS dS )NTF)graphnodesr   r   r7   s     r<   has_recomputable_opsr      s7    
   $ 	44	5r>   c                     | j         j        D ]F}t          |          r5t          |j        d          r t
          j        j        |j        j        v r dS GdS )NtagsTF)	r   r   r   hasattrtargettorchTagnondeterministic_seededr   r   s     r<   has_recomputable_rng_opsr      s^    
   4  	V,,	 	1T[5EEE445r>   c                     t          | j        d         t          j        t          j        f          rdS t          | j        d         t          j                  sJ dS )Nvalr       )
isinstancer~   r   SymIntSymBoolSymFloatr   s    r<   sym_node_sizer      sM    $)E"U\5=$ABB qdi&777771r>   c                       e Zd Zd ZdS )InvalidNodeBasec                     dS )NzInvalid NoderN   rc   s    r<   __repr__zInvalidNodeBase.__repr__   s    ~r>   N)rG   rH   rI   r   rN   r>   r<   r   r      s#            r>   r   joint_graphrQ   outputsoutputs_descssubgraphc                 f   t          j                    }i |D ]-}|                    |j                  }|j        |_        ||<   .| j        D ]}t          |          r|dk    rt          |<   "t          |          r|dk    rt          |<   B|v rG|j	        dk    rt          |<   ]|j	        dk    r`t          j        |j        i |j        }fd|D             }t          |          rt          |<   |                    |fd          |<   |j	        dk    r|                    |fd          |<   |j	        d	k    r	 g }	|D ]}
t!          |
t           j                  r\|
vrt%          d
|
 d          t!          |
         t&                    rJ d
|
 d            |	                    |
                    x|	                    |
           |                    t-          |	                    }||j        d<   |                                 |                                 |S )a  
    Given a graph, extracts out a subgraph that takes the specified nodes as
    inputs and returns the specified outputs.

    This includes specifying non-placeholder nodes as inputs.

    The general strategy is to initialize all inputs with proxies as we
    encounter them, and trace through the graph, only keeping values which take
    in valid proxies. Then, all dead code is eliminated.
    backwardforwardplaceholdercall_functionc                 z    g | ]7}t          |t          j                  t          |         t                    8S rN   )r   rL   rM   r   )r[   xenvs     r<   
<listcomp>z6_extract_graph_with_inputs_outputs.<locals>.<listcomp>   sI       a))3q6?33  r>   c                     |          S r9   rN   r   r   s    r<   r_   z4_extract_graph_with_inputs_outputs.<locals>.<lambda>       CF r>   get_attrc                     |          S r9   rN   r   s    r<   r_   z4_extract_graph_with_inputs_outputs.<locals>.<lambda>   r   r>   outputrm   z couldn't be found in envz was invalid, but is outputdesc)rL   Graphr   namer~   r   _must_be_in_backwardInvalidNode_must_be_in_forwardoppytreearg_tree_leavesargskwargsany	node_copyr   rM   RuntimeErrorr   appendr   tupleeliminate_dead_codelint)r   rQ   r   r   r   	new_graphr7   new_nodeall_argsoutput_valuesr   outr   s               @r<   "_extract_graph_with_inputs_outputsr      s   " 

I
C   ((33	D		!  %% 	(j*@*@#CIt$$ 	Y)>)>#CI3;; W%%#CIIW''-tyHDKHHH   !  H
 8}} 'D	!++D2B2B2B2BCCCIIW
""!++D2B2B2B2BCCCIIW  M 	$ 	$a!! 	$||"#G1#G#G#GHHH!#a&/::  6666 :   Q((((  ####


5//
0
0C$CHV!!###NNr>   c                     | j         dk    o5dt          | j                  vot          |            ot	          |            S Nr   tangents)r   strr   _is_bwd_seed_offset_is_fwd_seed_offsetr   s    r<   
_is_primalr      sP    =  	*c$+...	*#D)))	* $D)))	r>   c                 D    | j         dk    odt          | j                  v S r   r   r   r   r   s    r<   _is_tangentr     s#    7m#F
c$+6F6F(FFr>   c                 p    | j         dk    o+dt          | j                  v pdt          | j                  v S )Nr   bwd_seedbwd_base_offsetr   r   s    r<   r   r   	  =    7m# c$+&&&O*;s4;?O?O*Or>   c                 p    | j         dk    o+dt          | j                  v pdt          | j                  v S )Nr   fwd_seedfwd_base_offsetr   r   s    r<   r   r     r   r>   c                 r    | j         dk    o,t          | j                            d          t                    S )Nr   r   )r   r   r~   r   r   r   s    r<   _is_backward_stater     s,    7m#W
49==3G3G(W(WWr>   c                 @    | j                             dd           dk    S )Npartitioner_tagis_backwardr~   r   r   s    r<   _has_tag_is_backwardr     s    9==*D11]BBr>   c                 @    | j                             dd           dk    S )Nr   must_be_in_forwardr   r   s    r<   _has_tag_must_be_in_forwardr     s    9==*D115IIIr>   c                 @    | j                             dd           dk    S )Nr   must_be_in_backwardr   r   s    r<   _has_tag_must_be_in_backwardr   !  s    9==*D115JJJr>   c                      t          |           S r9   )r   r   s    r<   r   r   %  s    &t,,,r>   c                 \    t          |           pt          |           ot          |           S r9   )r   r   r*   r   s    r<   r   r   )  s/    '-- T""<t'<'<r>   joint_modulec          	         t          j        d | j                            d          D              }t          j        t	          t          | j                            d                              j                            dd gt          |          z                      }|d |         }||d          }|d |         }||d          }||||fS )Nc              3   $   K   | ]}|j         V  d S r9   r   r[   r7   s     r<   r]   z+_extract_fwd_bwd_outputs.<locals>.<genexpr>3  s$      	K	K$)	K	K	K	K	K	Kr>   r   r   r   )	r   r   r   
find_nodesnextiterr~   r   len)r   num_fwd_outputsr   r   fwd_outputsbwd_outputsfwd_outputs_descsbwd_outputs_descss           r<   _extract_fwd_bwd_outputsr   /  s     $	K	K 2 = = = J J	K	K	KG *T,$//8/<<==>>CGGTFS\\)	
 	
 M
 *?*+K/**+K%&6&67%o&6&67%68IIIr>   saved_valuesr   c                 V    | D ]%}|j         |k    r|                     |            d S &d S r9   )r   remove)r   r   saved_values      r<   _remove_by_namer   A  sI    #  t##,,,EE $ r>   fwd_module_outputsc                     t          |           }t          t          |           dz
  dd          D ]}t          | |                   s|dz   } n|S )Nr    )r   ranger   )r   idxis      r<   find_first_sym_noder  H  sk      
!
!C3)**Q.B77  -a011 	a%CE	 Jr>         @-q=r   maxminc           	      
   |                      |          5  |                     t          j        j        j        j        |f          }t          j        j        j                            |j        d                   |j        d<   t          |j        d                   |j        d<   d d d            n# 1 swxY w Y   |                      |          5  |                     t          j        j        j	        j        |dgdf          }t          j        j        j	                            |j        d         dgd          |j        d<   t          |j        d                   |j        d<   d d d            n# 1 swxY w Y   |                      |          5  |                     t          j        j
        j        j        |t          j        f          }t          j        j
        j                            |j        d         t          j                  |j        d<   t          |j        d                   |j        d<   d d d            n# 1 swxY w Y   |                      |          5  |                     t          j        j        j        j        ||f          }t          j        j        j                            |j        d         |          |j        d<   t          |j        d                   |j        d<   d d d            n# 1 swxY w Y   |                      |          5  |                     t          j        j        j        j        |f          }t          j        j        j                            |j        d                   |j        d<   t          |j        d                   |j        d<   d d d            n# 1 swxY w Y   |                      |          5  |                     t          j        j        j        j        ||f          }	t          j        j        j                            |j        d         |          |	j        d<   t          |	j        d                   |	j        d<   d d d            n# 1 swxY w Y   |                      |	          5  |                     t          j        j
        j        j        |	t          j        fdt%          |j                  z             }
t          j        j
        j                            |	j        d         t          j                  |
j        d<   t          |
j        d                   |
j        d<   d d d            n# 1 swxY w Y   |
S )Nr   r   tensor_metar   T
fp8_scale_r   r   )inserting_afterr   r   opsatenabsdefaultr~   r   amaxprimsconvert_element_typefloat64	clamp_min
reciprocalmulTensorfloat32r   r   )r   r7   r  r  abs_node	amax_nodeamax_64_nodeclamp_min_nodereciprocal_nodemul_node
scale_nodes              r<   calculate_quantization_scalingr"  S  s    
		t	$	$ U U&&IN& ' 
 
  %y~199$)E:JKKe'>x}U?S'T'Tm$U U U U U U U U U U U U U U U 
		x	(	( W W''IN'RD$' ( 
 
	 !&	 3 ; ;M% 2$!
 !
	u )@	u@U(V(V	}%W W W W W W W W W W W W W W W 
		y	)	) 

 

**IO08U]+ + 
 
 $)9?#G#O#ON5!5=$
 $
%  ,Ce$,
 ,
-(

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 
		|	,	, 

 

,,IN$,$ - 
 
 &+Y^%=%E%Ee$c&
 &
E" .E&.
 .
M*

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 
		~	.	. 

 

--IN%- " . 
 
 ',in&?&G&G&'
 '
U# /F '/
 /
]+

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 
			/	/ U U&&IN%!3' ' 
 
  %y~188 ' 
  
e (?x}U?S'T'Tm$U U U U U U U U U U U U U U U 
		x	(	( 	Y 	Y((IO08EM*DI. ) 
 


 "'!E!M!MM% %-"
 "

 *AQVAW)X)X
&	Y 	Y 	Y 	Y 	Y 	Y 	Y 	Y 	Y 	Y 	Y 	Y 	Y 	Y 	Y s   BB22B69B6BE44E8;E8B&II
I
&BLLL$BO  OO BQ>>RRB<U&&U*-U*r!  
quant_typer  	clamp_maxc           	         |                      |          5  |                     t          j        j        j        j        |t          j        f          }t          j        j        j                            |j        d         t          j                  |j        d<   t          |j        d                   |j        d<   d d d            n# 1 swxY w Y   |                      |          5  |                     t          j        j
        j        j        ||f          }t          j        j
        j                            |j        d         |j        d                   |j        d<   t          |j        d                   |j        d<   d d d            n# 1 swxY w Y   |                      |          5  |                     t          j        j
        j        j        ||f          }t          j        j
        j                            |j        d         |          |j        d<   t          |j        d                   |j        d<   d d d            n# 1 swxY w Y   |                      |          5  |                     t          j        j
        j        j        ||f          }	t          j        j
        j                            |j        d         |          |	j        d<   t          |	j        d                   |	j        d<   d d d            n# 1 swxY w Y   |                      |	          5  |                     t          j        j        j        j        |	|fdt          |j                  z             }
t          j        j        j                            |	j        d         |          |
j        d<   t          |
j        d                   |
j        d<   d d d            n# 1 swxY w Y   |
S )Nr   r   r
  
fp8_quant_r  )r  r   r   r  r  r  r  r  r~   r   r  r  r  r  r$  r   r   )r   r7   r!  r#  r  r$  target_node_32scaled_target_nodeclamp_min_scaled_nodeclamp_max_scaled_nodequant_activation_nodes              r<   perform_quantizationr,    s    
		z	*	* 

 

,,IO08& - 
 
 &+Y_%I%Q%QIeem&
 &
E" .E&.
 .
M*

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 
		~	.	. 

 

"00IN% *- 1 
 
 */);)B)B&
(>*
 *
& 2I#E*2
 2
.

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 
		1	2	2 

 

 % 3 3IN$,$i0 !4 !
 !
 -2IN,D,L,L#E*I-
 -
"5) 5L!&u-5
 5
"=1

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 
		4	5	5 

 

 % 3 3IN$,'3 !4 !
 !
 -2IN,D,L,L!&u-y-
 -
"5) 5L!&u-5
 5
"=1

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 
		4	5	5 
 
 % 3 3IO08'4DI. !4 !
 !
 IO088%*51:  	"5)
 5L!&u-5
 5
"=1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 ! s^   B&CCC(BFFF1BIII/BLLL-B(O!!O%(O%tensorc                 b    |                                  }|                                 }||z  dz  S )z
    Calculate the size of a PyTorch tensor in megabytes (MB).

    Args:
        tensor (torch.Tensor): Input tensor

    Returns:
        float: Memory size in MB
    i   )numelelement_size)r-  num_elementsr0  s      r<   calculate_tensor_sizer2    s2     <<>>L&&((L<'K88r>   c                      t           j        j        j        d                             dd          } d |                     d          D             } | S )N!activation_quantization_aten_passallowed_dtypesztorch.bfloat16c                 j    g | ]0}t          t          |                    d           d                   1S ).r   )getattrr   split)r[   dtypes     r<   r   z&get_allowed_dtypes.<locals>.<listcomp>  s@       16u{{3''+,,  r>   ;)r   	_inductorr   post_grad_fusion_optionsr   r9  )r5  s    r<   get_allowed_dtypesr>    s_    _+D+	c
,--  :H:N:Ns:S:S  N r>   c                 <   t                      }t          |           r| j        d         j        |vrdS t          j        j        j        d                             dd          }t          | j        d                   }t          j        j        j        d                             dd          s||k    S t          j        j        j        d                             dd          r't          ||k              pt          ||k               S t          ||k              S )Nr   Fr4  
size_in_mbd   skip_dynamo_guardsquantize_dynamic_shape)r>  r   r~   r:  r   r<  r   r=  r   r2  r   r   )r7   r5  size_thresholdr@  s       r<   should_quantizerE     s   '))Nd## ty'7'=^'S'Su_+D+	c,  'ty'788J?!:+	c
&&G ^++ ?!:/

#&
.
.	G )n,  J+J.,HIIIJ
 )~)EFFFr>   c                      t           j        j        j        d                             dd          } t          t           |                     d          d                   S )Nr4  r#  ztorch.float8_e5m2r7  r   )r   r<  r   r=  r   r8  r9  )r#  s    r<   get_quant_typerG    sO    '@+	c,+,,  5***3//3444r>   r:  c                 F    t          j        |           }|j        |j        fS )z
    Calculate the range of values for a given torch.dtype.
    Args:
        dtype (torch.dtype): The input dtype.
    Returns:
        tuple: A tuple containing the minimum and maximum values.
    )r   finfor  r  )r:  infos     r<   calculate_rangerK  "  s"     ;uD8TXr>   c           
         |                      d          d         }|j        d         }t                      }t          |          \  }}t	                      g g }}|D ]}|j                            dd          rjt          j        j	        j
        d                             dd          rat          | ||d	          }	t          | ||	|||          }
t          |	          s|                    |	           n|                    |	           n|                     |          5  |                     t          j        j        j        j        ||fd
t+          |j                  z             }
t          j        j        j                            |j        d         |          |
j        d<   t/          |
j        d                   |
j        d<   d d d            n# 1 swxY w Y   |
|<   fd|D             }t1          |          }||z   }|r|d |         |z   ||d          z   }|                    dt5          |                     t6          d         dxx         dz  cc<   d S )Nr   r   r   saved_for_quantizationFr4  use_scalingTr  r&  r  r   r
  c                 ,    g | ]}|v r|         n|S rN   rN   )r[   r7   node_to_quants     r<   r   z*quantize_activation_fw.<locals>.<listcomp>Z  s:       CGt}44d$  r>   inductor%activation_quantization_fwd_aten_passr    )r   r   rG  rK  rp   r~   r   r   r<  r   r=  r"  r,  r   r   r  r   r  r  r  r  r   r   r   r  
update_argr   r   )r   r   r   r#  r  r$  tensor_scale_nodessym_scale_nodesr7   r!  
quant_nodeoutput_updated_argsr  scale_nodesrP  s                 @r<   quantize_activation_fwrY  .  s   **1-F+a.K!!J*:66IyFFM*,b #- #-9==1599 !	-%>3c-&& <4E 
 24ZI 
 #:.. 7&--j9999#**:6666 **400  !&!4!4	<D"J/)C	NN: "5 " "J 	<DD Ie,j  OE*
 6M".6 6JOM2               #-M$   KV   1
2
2C$6K 
%36I#$$6OO 	 a233444Z@AAAQFAAAAAs   $B(GG	G	c           
      
	  	 d | j         D             }d }|D ]O}|j                            dd          r0|j                            d           |j                            d          }t          j        j        j        d                             dd          r|                     |          5  d|j	        
                    dd	          z   	t          	fd
|D                       }d d d            n# 1 swxY w Y   |                     |          5  |                     t          j        j        j        j        ||f          }t          j        j        j                            |j        d         |          |j        d<   t#          |j        d                   |j        d<   d d d            n# 1 swxY w Y   |                     |          5  |                     t          j        j        j        j        ||f          }t          j        j        j                            |j        d         |j        d                   |j        d<   t#          |j        d                   |j        d<   d d d            n# 1 swxY w Y   |                     |          5  |                     t          j        j        j        j        ||f          }t          j        j        j                            |j        d         |          |j        d<   t#          |j        d                   |j        d<   d d d            n# 1 swxY w Y   n|                     |          5  |                     t          j        j        j        j        ||fdt+          |j	                  z             }t          j        j        j                            |j        d         |          |j        d<   t#          |j        d                   |j        d<   d d d            n# 1 swxY w Y   t-          |j                                                  D ]$}||k    r||k    r|                    ||           %Qt4          d         dxx         dz  cc<   d S )Nc                 (    g | ]}|j         d k    |S )r   r   r   s     r<   r   z*quantize_activation_bw.<locals>.<listcomp>j  s$    JJJ$M1I1I1I1I1Ir>   rM  Fdequant_typer4  rN  r  r&   c              3   2   K   | ]}|j         k    |V  d S r9   r   )r[   	bwd_input
scale_names     r<   r]   z)quantize_activation_bw.<locals>.<genexpr>x  s<       & &%$>Z77 "7777& &r>   r   r   r
  dequant_r  rQ  %activation_quantization_bwd_aten_passr    )r   r~   r   popr   r<  r   r=  r  r   r   r   r   r  r  r  r  r   r  divr  r   ro   userskeysreplace_input_withr   )
r   	bw_inputsactivation_noder7   r\  r!  divided_target_node_32dequant_nodeuserra  s
            @r<   quantize_activation_bwrn  i  s   JJ%+JJJIO H@ H@9==1599 G	@IMM23339==88L%>3c-''? **400  !-	0A0A,PR0S0S!SJ!% & & & &)2& & & " "J               **:66  &+&9&9	<D"L1 ': ' 'O
 	<DD Ie,l  $(/
 ;R',U3; ;O(7               **?;; 
 
-2-@-@	*1-z: .A . .* :?9K9R9R',U3Z_U5K: :*/6 00F0KE0RSS +/>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 **+ABB  #(#6#6	<D4lC $7 $ $L
 	<DD27>  !%e,
 8O$)%08 8L%m4               **400  #(#6#6	<D"L1'#di..8 $7 $ $L 	<DD Ie,l  !%e,
 8O$)%08 8L%m4               TZ__..// @ @<''DO,C,C++D,???Z@AAAQFAAAAAs^   0:C66C:	=C:	BF44F8	;F8	BI==J	J	BL;;L?	L?	B(PP	P	
fwd_module
bwd_modulebwd_module_inputsc                     t          dd  fd           t           j                   t          dd  fd           t          dd fd            j                            d	
          d         j        d         }|D ]}d|j        v r||j                            dd                   }j                            |          5  j                            |j                  }d d d            n# 1 swxY w Y   |j	        d         }|j	        
                    |j	                   d|j	        d<   ||j	        d<   |                    |           j                            |           t          j        j        j        d                             dd          rt%          j                            d
                    }|d         }	t'          |          D ]}
t)          |
          s|
}	 n j                            d	
          d         j        d         }|D ]~}d|j        v rsj                            |	          5  j                            |j                  }d d d            n# 1 swxY w Y   |j	        
                    |j	                   |}	t+          j                   t          dd fd           d S )Nartifactc                      dddS )N,before_activation_quantization_fwd_aten_passstringr   encodingrN   rN   r>   r<   r_   z5perform_fp8_activation_quantization.<locals>.<lambda>      B 
 
 r>   c                  4                          ddd          S NFT)print_outputinclude_strideinclude_deviceprint_readablero  s   r<   r_   z5perform_fp8_activation_quantization.<locals>.<lambda>  #    :44tD 5 
 
 r>   metadata_fn
payload_fnc                      dddS )N+after_activation_quantization_fwd_aten_passrv  rw  rN   rN   r>   r<   r_   z5perform_fp8_activation_quantization.<locals>.<lambda>      A 
 
 r>   c                  4                          ddd          S r{  r  r  s   r<   r_   z5perform_fp8_activation_quantization.<locals>.<lambda>  r  r>   c                      dddS )N,before_activation_quantization_bwd_aten_passrv  rw  rN   rN   r>   r<   r_   z5perform_fp8_activation_quantization.<locals>.<lambda>  ry  r>   c                  4                          ddd          S r{  r  rp  s   r<   r_   z5perform_fp8_activation_quantization.<locals>.<lambda>  r  r>   r   r   r   r&  r]  r_  r\  TrM  r4  rN  r   r   r  c                      dddS )N+after_activation_quantization_bwd_aten_passrv  rw  rN   rN   r>   r<   r_   z5perform_fp8_activation_quantization.<locals>.<lambda>  r  r>   c                  4                          ddd          S r{  r  r  s   r<   r_   z5perform_fp8_activation_quantization.<locals>.<lambda>  r  r>   )r   rY  r   r   r   r   r   r  r   r~   updatereplace_all_uses_with
erase_noder   r<  r   r=  r   ro   reversedr   rn  )ro  rp  rq  quant_fwd_module_outputsfwd_noder`  quant_bwd_inputr\  quant_bwd_module_inputsbwd_input_locbw_inputscaled_fwd_module_outputsscale_bwd_inputs   ``           r<   #perform_fp8_activation_quantizationr    s   
 
 

 
 
 
	 	 	 	 :+,,,
 

 
 
 
	 	 	 	 
 

 
 
 
	 	 	 	  */::h:GGJOPQR, 
3 
38=(()(-*?*?b*Q*QRI!11)<< S S","2">">HM">"R"RS S S S S S S S S S S S S S S$>.9L ''666=AO !9:3?O 0++O<<<''	2226+	c-0 #'z'7'B'Bm'B'T'T"U"U/3 !899 	 	Hx((  ( %/$4$?$?8$?$L$LQ$O$TUV$W!1 	0 	0Hx},,%55mDD W W&0&6&B&B&B&V&VOW W W W W W W W W W W W W W W$++HM::: /:+,,,
 

 
 
 
	 	 	 	 	 	s$   !C::C>	C>	!I;;I?	I?	rV   c                    t           j                            dd           	 d S |rd |D             ng }d | D             }t          j        j        j        d                             dd          rd | D             }|j                            d          d	         j        d	         }d
 |j                            d          D             }d}|D ]}	|	j	        |v rt          |	          r|	j	        |v r!t                              d|	j	                   Dd|	j        d<   |	j        d         j        |	j        d<   d||	j	                 j        d<   |	j        d         j        ||	j	                 j        d<   d}|rt          |||           d S d S )Nr4  c                     g | ]	}|j         
S rN   r_  r   s     r<   r   z2enable_activation_quantization.<locals>.<listcomp>   s    ;;;t;;;r>   c                     i | ]
}|j         |S rN   r_  r   s     r<   
<dictcomp>z2enable_activation_quantization.<locals>.<dictcomp>$  s    CCCd$)TCCCr>   exclude_primalsFc                 0    i | ]}d |j         v|j         |S )primalsr_  r   s     r<   r  z2enable_activation_quantization.<locals>.<dictcomp>(  s/     
 
 
 $	8R8RDIt8R8R8Rr>   r   r   r   c                     i | ]
}|j         |S rN   r_  r   s     r<   r  z2enable_activation_quantization.<locals>.<dictcomp>,  s)        	4  r>   r   z*Skipping quantization of static input %s: TrM  r   r\  )inductor_configr=  r   r   r<  r   r   r   r   r   rE  r/   debugr~   r:  r  )
r   ro  rp  rV   static_input_namessaved_values_namesr   rq  should_perform_fp8_quantr7   s
             r<   enable_activation_quantizationr    s    	044/	
 	
 	
 	 '	;;:;;;; 
 DClCCC6+	c
U##

 
(4
 
 
 $)444AA!DI!L $.$4$?$?=$?$Q$Q    %" 
, 
,9***t/D/D*y...		F	RRR26DI./(,	%(8(>DIn%JNdi(-.FG@D	%@P@Vdi(-n='+$ W+J
DUVVVVVW Wr>   )rV   saved_sym_nodesr   c                ,   t          | |          \  }}}}| j                            d          }	g t          t          |	          }
g t          t
          |	          }g t          t          |	          }g t          t          |	          }g t          t          |	          }t          | j        ||z   |z   |z   ||d          }t          j                                        }|                    d          D ]}|j        s+t          ||j                   t          ||j                   4|rIt!          d |j        D                       r+t          ||j                   t          ||j                   t          |          rt          ||j                   |sJ t#                      }g }g }|D ]S}t%          |          }|r+|                    |           |                    |           >|                    |           Tt+          | j                  }t-          j        |||          D ]c}d|j        vrt3          |j        d                   |z
  }t5          |d           D ]"}||vr|                    ||                    #||z  }d|                                 |                    ||z              t          | j        |
|z   ||z   |z   |d	 t;          t=          |          t=          |          z             D             z   d
          }t          | j        ||z   |z   |z   |z   ||d          }t>          j         !                    | |          }t>          j         !                    | |          }tE          ||||           ||fS )Nr   r   r   r   c              3      K   | ]>}|j         t          j        j        j        j        u ot          |j                  d k    V  ?dS r   N)r   r   r  _c10d_functionalwait_tensorr  r   rf  rZ   s     r<   r]   z+_extract_fwd_bwd_modules.<locals>.<genexpr>e  s_       )
 )
  H	2>FF "AG!)
 )
 )
 )
 )
 )
r>   r   c                     | j         S r9   r_  )ss    r<   r_   z*_extract_fwd_bwd_modules.<locals>.<lambda>  s    16 r>   r`   c                 ,    g | ]}t          |          S rN   )r'   r[   r  s     r<   r   z,_extract_fwd_bwd_modules.<locals>.<listcomp>  s0     
 
 
 'q))
 
 
r>   r   )#r   r   r   filterr   r   r   r   r   r   r   distributedis_availablerf  r   r   allr   r   addr   r   	itertoolschainr~   r   rb   clearextendr  r   rL   _lazy_graph_module_make_graph_moduler  )r   r   r  r   rV   r   r   r   r   placeholdersprimal_inputstangent_inputsfwd_seed_offset_inputsbwd_seed_offset_inputsbackward_state_inputs	bwd_graphdistributed_enabledr7   saved_symbolssaved_sym_nodes_bindingsaved_sym_nodes_derivedsymbolsymbol_bindingsnew_symbolsr  	fwd_graphro  rp  s                               r<   _extract_fwd_bwd_modulesr  @  sK    	!OOO CK/1B  %00M0BBL7fZ667M9vk<889NIv&9<HHIIv&9<HHIGf%7FFG2,&7:PP I  +88::$$$66 ) )z 	)L$)444OTY7777
 ! 
	)S )
 )
 Z)
 )
 )
 &
 &
 
	)
 L$)444OTY7777%% 	)L$)444(((( /9llM     1 1*400 	1f%%%#**40000#**40000 3<3EFFO 7~VV % %	!!"49U#344}D)9)9::: 	? 	?A ''#**?1+=>>>>$
 25LLMMM 3..l"_4
 
3|,,s?/C/CCDD
 
 
	

 	
 
I 3
	
	 !	!  		 
 	
 
I &99,	RRJ&99,	RRJ"j*.I   z!!r>   )static_lifetime_input_indicesrV   r  c                   t          |           rt          | |||          S t          t          t          | j        j                            }t          t          t          | j        j                            }||z   }t          | |          \  }}	}
}t          | j        |||
d          }t          d |j        D                       g }g }| j        j        D ]}|j        vrt          |          r|                    |           1d|j        vrC|j        dk    r8|j        }t#          d |D                       sJ |                    |           }fd|j        D             }d|j        v r/t#          d	 |D                       r|                    |           |                    |           t          t&                              |                                                    }t          t&                              |                                                    }t-          | ||||
          S )a  
    Partitions the :attr:`joint_module` in a manner that closely resembles the
    behavior observed in the original ``.forward()`` and ``.backward()`` of the
    callable, i.e., the resulting forward graph contains those operators that
    are executed in the original ``.forward()`` callable passed to
    :func:`aot_function`.

    The default partitioner collects the operators that are between the forward
    inputs and the forward outputs. This helps in finding the tensors which have
    to be stashed for the backward pass. These stashed tensors become the output
    of the generated forward graph. The remaining operators are then placed in
    the backward graph.

    .. warning::
        This API is experimental and likely to change.

    Args:
        joint_module(fx.GraphModule): The joint forward and backward graph. This
            is the result of AOT Autograd tracing.

    Returns:
        Returns the generated forward and backward Fx graph modules.
    )r   r  r  r   c              3   :   K   | ]}|j         d k    |j        V  dS r   Nr   r   r   s     r<   r]   z$default_partition.<locals>.<genexpr>  s9       $ $$'X:M:M	:M:M:M:M$ $r>   r
  r   c              3   @   K   | ]}|j         t          j        k    V  d S r9   )r   operatorgetitemr[   rm  s     r<   r]   z$default_partition.<locals>.<genexpr>  s,      II4t{h&66IIIIIIr>   c                 &    g | ]}|j         v|S rN   r_  )r[   r\   forward_node_namess     r<   r   z%default_partition.<locals>.<listcomp>  s-       7I)I)I)I)I)Ir>   c              3   4   K   | ]}t          |          V  d S r9   r   rZ   s     r<   r]   z$default_partition.<locals>.<genexpr>   s9       2 2#$A2 2 2 2 2 2r>   r  r   rV   )r   #min_cut_rematerialization_partitionro   r  r   r   r   r   r   r   r   r   r   r   r~   r   rf  r  r  rp   fromkeysrg  r  )r   _joint_inputsr   r  rV   r  r  rQ   r   r   r   r   forward_only_graphr   r  r7   rf  backward_usagesr  s                     @r<   default_partitionr    s   > L)) 
2+*G	
 
 
 	
 
L,>,DEEFFM!&)<l>P>V"W"WXX33F OOO CK/1B <FK1BI  $ $ $06$ $ $   LO"( * *9...t 	* ""4(((($)++?0J0JJEII5IIIIIIII&&&&   :  O 	))c 2 2(72 2 2 / /)  &&7777##D))))l3388::;;L4==99>>@@AAO#''$?   r>   g    .Ar/  c                     | |j         z  S r9   )itemsize)r/  r:  s     r<   _tensor_nbytesr    s    5>!!r>   c                 x   dt           fdd| j        v r| j        d         }t          |t                    rdS t          |t          t
          f          rt          fd|D                       S t          |t                    r-t          fd|                                D                       S t          |t          j
                  r |          S t          dt          |           d|            | j        d	k    s"| j        t          j        j        j        j        u rd
S t          d|  d          )NrW   c                     t          | t          j                  sdS t          t	          |                                 d          | j                  S )Nr      fallback)r   r   r  r  r   r/  r:  r   s    r<   object_nbytesz_size_of.<locals>.object_nbytes"  sB    !U\** 	1hqwwyy4@@@!'JJJr>   r   r    c              3   .   K   | ]} |          V  d S r9   rN   )r[   r\   r  s     r<   r]   z_size_of.<locals>.<genexpr>/  s-      55A}}Q''555555r>   c              3   4   K   | ]\  }} |          V  d S r9   rN   )r[   _r\   r  s      r<   r]   z_size_of.<locals>.<genexpr>1  s1      @@DAq}}Q''@@@@@@r>   zUnknown metadata type z	 on node r   r   rm   zO didn't have `val` metadata; we should always have `val` metadata on the nodes.)rq   r~   r   r   ro   r   sumrp   itemsr   r  r   typer   r   r  r  _assert_scalarr  )r7   r   r  s     @r<   _size_ofr  !  s^   KC K K K K
 	ic<(( 
	&1 dE]++ 	&5555555555T"" 	&@@@@CIIKK@@@@@@U\** 	& =%%%NDIINNNNOOOw*uy~/L/T T Tq
eeee  r>   c           	      .   ddl m}  |t                    }| j        D ]'}|j        dk    r||j        j        xx         dz  cc<   (t                              dt          |
                                t          j        d          d                     d S )Nr   r   r   r    z%sTra   reverse)collectionsr   rq   r   r   r   rG   r/   rJ  rb   r  r  
itemgetter)r   r   cntr7   s       r<   
_count_opsr  >  s    ''''''%+c**C + +7o%%$%%%*%%%HHT6#))++8+>q+A+A4PPPQQQQQr>   c                  v   g } t          t          j        j                  D ]}t	          t          j        j        |          }t          |t          j        j                  sA|                                D ]A}t	          ||          }t          j	        j
        |j        v r|                     |            nB| S r9   )dirr   r  r  r8  r   _opsOpOverloadPacket	overloadsr   	pointwiser   r   )r  	attr_nameopoverloadpacketoverloadop_overloads        r<   pointwise_opsr	  H  s    
C(( 
 
	"59>9==*EJ,GHH 	(2244 	 	H!"2H==Ky"k&666

+,,, 7
 Jr>   	depth_mapc                     fd| D             }t          |                                t          j        d          d          S )Nc                 j    i | ]/}t          |t          j        j        j                  &||         0S rN   )r   r   rL   r7   rM   )r[   argr
  s     r<   r  zsort_depths.<locals>.<dictcomp>[  sE        #z#ux}?Q/R/RYs^  r>   r    Tr  )rb   r  r  r  )r   r
  
arg_depthss    ` r<   sort_depthsr  Z  sY       '+  J *""$$(*=a*@*@$OOOOr>   gmc                 6  	
 t          j                    
i 	| j                            d          D ]}
                    |	fd          	|<   d t          | j        j                  D             	
fd}t          t          t          | j        j                            }d}t          j        }|D ]"}|j        D ]}|         |k     r
|         }|}#|| S t          | j        j                  d|                  D ]<}|j        dk    r/|j        t          j        j        j        j        k    r ||           =t          | j        j                  |         d         D ]} ||           t          j                             | 
          }|S )a  
    This pass finds the first bwd node in the graph (by looking at users of
    tangents) and then reorders the graph by walking from this node to all the
    way to the end of the graph. At each op in this traversal, we insert this op
    in a new graph and try to bring only the relevant subgraph from the other
    non-bwd edges relevant for this op. This closely mimics the behavior of
    autograd engine.

    Why is this pass required in the first place?

    This is an artifact of how partitioners work today. The starting point of
    partitioner is a joint graph, which is fwd and then bwd graph. In the case
    of checkpointing, we keep portions of fwd graph in their original place in
    the joint graph, while obtaining a bwd graph. As a result, the resulting bwd
    graph has copies of recomputed fwd subgraphs followed by the original bwd
    graph. If we run this naively, this leads to bad memory footprint, because
    the fwd subgraphs are live for way longer duration than necessary. This pass
    reorders the operations such that we prioritize the ops for the original bwd
    graph while only realizing those ops from the fwd graph that are necessary
    at any given point in the graph.
    r   r   c                     |          S r9   rN   r   s    r<   r_   z5reordering_to_mimic_autograd_engine.<locals>.<lambda>}  s    A r>   c                     i | ]\  }}||	S rN   rN   r[   r  r7   s      r<   r  z7reordering_to_mimic_autograd_engine.<locals>.<dictcomp>  s    BBB93T3BBBr>   c                 X   | g}t                      }t          |          dk    rO|                                } | |v s| v r0|                    |            || j        z  }t          |          dk    Ot          |fd          }|D ]}                     | fd          | <   d S )Nr   c                     |          S r9   rN   )r\   orders    r<   r_   zSreordering_to_mimic_autograd_engine.<locals>.insert_node_in_graph.<locals>.<lambda>  s    %( r>   r`   c                     |          S r9   rN   r   s    r<   r_   zSreordering_to_mimic_autograd_engine.<locals>.insert_node_in_graph.<locals>.<lambda>  r   r>   )r   r   rd  r  all_input_nodesrb   r   )r7   	cur_nodesinsertable_nodesr   r   r  s      r<   insert_node_in_graphzAreordering_to_mimic_autograd_engine.<locals>.insert_node_in_graph  s    F	0:)nnq  ==??D'''43;;  &&& --I )nnq   ""28J8J8J8JKKK$ 	D 	DD!++D2B2B2B2BCCCII	D 	Dr>   Nr   )rL   r   r   r   r   	enumerater   ro   r  r   mathinfrf  r   r   r   r  r  copy_r  GraphModule)r  r7   r  r  first_node_in_bwdminimum_ordertangentrm  new_gmr   r   r  s            @@@r<   #reordering_to_mimic_autograd_enginer&  a  s   . 

I"$C ##}#55 @ @''.>.>.>.>??D		BB	"(.(A(ABBBED D D D D D D$ &bhn==>>NHM! ) )M 	) 	)DT{]** %d$(!	)  	 RX^$$%?u->'?%?@ ' '7o%%$+9M9U*U*U  &&&RX^$$U+<%=%?%?@ # #T"""" X!!"i00FMr>   	fw_module	bw_modulefw_nodebw_nodedevice	rng_countlast_fwd_inputlast_bwd_inputc                 n   |j         }|J | j        }	|j        }
t          j        j        j        }| j                            |          5  | j                            d|           }t          |          |j	        d<   |}ddd           n# 1 swxY w Y   |j                            |          5  |j                            d|           }t          |          |j	        d<   |}ddd           n# 1 swxY w Y   t          |j                  }||d<   | j                            |          5  |	                    d||j        g|j        R |          }ddd           n# 1 swxY w Y   |                    |           |	                    |           t          |j                  }||d<   |
                    |          5  |
                    d||j        g|j        R |          }|                    |           |
                    |           ddd           n# 1 swxY w Y   ||fS )a%  
    Note [CUDA Graph Safe RNG Functionalization]

    CUDA Graph capture doesn't work with get_rng_state and set_rng_state because these functions operate on CPU values,
    while CUDA Graph RNG capture uses on-device CUDA tensors. To solve this, we use graphsafe_set_state with a
    CUDA Generator registered to the CUDA Graph before capture begins. graphsafe_set_state updates the generator's pointer
    to reference a different GeneratorImpl, ensuring subsequent calls are correctly forwarded to the desired generator
    (and its cuda-tensor RNG state during graph capture).

    For each RNG operation's forward/backward pair:

    - We create two generators initialized with identical values
    - Each forward and backward call advances its respective generator equally
    - This keeps generators synchronized so forward and backward operations use matching RNG values

    When forward is called multiple times before backward (causing desynchronization):

    - We save the forward RNG state
    - We update the backward Generator's state before executing backward

    Before each CUDA Graph replay, replay_prologue updates captured RNG pointers with current states, ensuring backward Generator
    changes are reflected during replay.

    This function modifies both forward and backward computation graphs by:

    Creating RNG state placeholders for both passes
    Updating the forward node to use graph-safe RNG state
    Updating the backward node to use graph-safe RNG state

    For more details: https://github.com/pytorch/pytorch/issues/113541
    Nfwd_rng_state_r   bwd_rng_state_	rng_stater   r   r   )indexr   r   _prims	rng_primsgraphsafe_run_with_rng_stater  r   r)   r~   rp   r   create_noder   r   r  r  inserting_before)r'  r(  r)  r*  r+  r,  r-  r.  
device_idxfw_graphbw_graphr7  fwd_rng_statebwd_rng_state	fw_kwargsfunctional_fw_node
bwd_kwargs
rng_outputs                     r<   %apply_graphsafe_rng_functionalizationrC    sU   R J!!!HH#(<#9#V  
	(	(	8	8 ' '!334PY4P4PQQ$?
$K$K5!&' ' ' ' ' ' ' ' ' ' ' ' ' ' ' 
	(	(	8	8 ' '!334PY4P4PQQ$?
$K$K5!&	' ' ' ' ' ' ' ' ' ' ' ' ' ' ' W^$$I*Ik		(	(	1	1 
 
%11(.07<00	 2 
 

 
 
 
 
 
 
 
 
 
 
 
 
 
 
 !!"4555    gn%%J+J{		"	"7	+	+ % %))(.07<00	 * 
 

 	%%j111G$$$% % % % % % % % % % % % % % % >))sI   
7BBB27C55C9<C93(E''E+.E+
AH((H,/H,num_sym_nodesc                 p  ' t          j                    }d }dt          t          j                 fd'dt          t          j                 fd} ||           } ||          } ||          }	i }
| j        j        D ]r}t          |          rat          |j	        d          rLt          j
        j        |j	        j        v r/||j                 }||j                 }|	|j                 }||d|
|<   st          j        j        j        }t          j        j        j        }d }|j                            d	          D ]}d
|j        v r|} n|t'          d          g }t)          t+          |j                            d	                              }t)          t+          |j                            d	                              }t-          'fd|
                                D                       }|                    t          j        d                     t3          |          dk    }t          j        j        }t6          j        o| o|j         p|j        j        }tA          |
!                                          D ]Y\  }\  }}|d         }|d         } '|          }|j        }|j        }|r'|%|j"        dk    rtG          ||||||||          \  }}[|$                    |          5  |%                    d||j	        g|j&        R |j'                  }|%                    dtP          j)        |dfi           } ||          |j*        d<   |%                    dtP          j)        |dfi           } tW          j+        |j*                  | _*        |,                    |            |-                    |           |.                    |           d d d            n# 1 swxY w Y   |$                    |          5  dt)          |           }!|/                    |!          }" ||          |"j*        d<   d d d            n# 1 swxY w Y   |$                    |          5  |%                    d||"|j	        g|j&        R |j'                  } |,                    |            |-                    |           d d d            n# 1 swxY w Y   [|rt)          ta          |j                            d	                              }#|#j&        d         }$t3          |$          |z
  }%|$d |%         tc          |          z   |$|%d          z   }&|j        2                    |&           |j        -                    |#           |3                                 |3                                 ||fS )Nc                     i }| j         j        D ]I}|j        dk    r<t          |j        d          r't
          j        j        |j        j        v r
|||j	        <   J|S )Nr   r   )
r   r   r   r   r   r   r   r   r   r   )gmodrandom_nodesr7   s      r<   get_rng_opsz*functionalize_rng_ops.<locals>.get_rng_ops&  sd    J$ 	/ 	/D?**DK00 +I59III*.TY'r>   rW   c                     d| j         vrdS | j         d         }t          |t                    s|f}|D ]5}t          |t          j                  r|j        j        dk    r	|j        c S 6t          j        d          S )zV
        Check the example value of the node outputs to find the device type.
        r   Ncudacpu)r~   r   r   r   r  r+  r  )r7   
candidates	candidates      r<   
get_devicez)functionalize_rng_ops.<locals>.get_device1  s     	!!4Yu%
*e,, 	'$J# 	, 	,I)U\22 ,#(F22$++++|E"""r>   r+  c                 B   ddl m}  |            }|J |5  | H| j        dk    r=|                    t          j                                                  cd d d            S |                    t	          j                              cd d d            S # 1 swxY w Y   d S )Nr   )detect_fake_moderK  )torch._guardsrQ  r  from_tensorr   rK  get_rng_state)r+  rQ  	fake_modes      r<   get_sample_rng_statez3functionalize_rng_ops.<locals>.get_sample_rng_stateC  s)   222222$$&&	$$$ 	@ 	@!fkV&;&; ,,UZ-E-E-G-GHH	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ (()<)>)>??	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@s   >B"%BBBr   )fwdbwdr   r   r$  zaCouldn't find tangent node in graph inputs. This is unexpected, please file a bug if you see thisc              3   :   K   | ]} |d                    V  dS )rW  NrN   )r[   	node_pairrO  s     r<   r]   z(functionalize_rng_ops.<locals>.<genexpr>o  sC        )2

9U#$$     r>   rL  r    rW  rX  rK  r   r3  r   r   rng_state_output_r   )4r  countr	   r   r+  r   r   r   r   r   r   r   r   r   r5  r6  run_and_save_rng_staterun_with_rng_stater   r   r   r  r   valuesdiscardr   r<  r   graphsafe_rng_functionalizationfallback_randomtest_configs*graphsafe_rng_func_ignores_fallback_randomr  r  r  rC  r9  r8  r   r   r  r  r~   copyr  r  r   r   r   r   r   	recompile)(r   r'  r(  rD  uidrI  rV  joint_graph_rng_opsfw_graph_rng_opsbw_graph_rng_opsrecomputable_rng_ops_mapr7   	base_noder)  r*  run_and_save_rngr^  bw_tangent_start_nodefw_rng_state_outputsr-  r.  devicesmulti_cuda_devices
ind_config'use_rng_graphsafe_rng_functionalizationr,  rZ  r+  r;  r<  r@  staterB  
state_namebw_rng_state_nodefw_output_node
fw_outputssym_node_start_idxr   rO  s(                                          @r<   functionalize_rng_opsrz    sN   2 /

C	 	 	#HU\2 # # # #$@Xel%; @ @ @ @ &+l33"{9--"{9--!"( 	S 	S4  	SV,,	S 	1T[5EEE+DI6I&ty1G&ty1G:A'2R2R$Y/|-D/B **m*<<  	!!$(!E " $o
 
 	
 (9?#=#=#=#O#OPPQQN(9?#=#=#=#O#OPPQQN    6N6U6U6W6W    G OOEL''((( W) 'J. 	
""	
 ** R&Q , .7 &&((. . I- I-)	)Iy E"E"G$$?? 4>	-"v%%-R	. 	.*NNN **733 3 3%-%9%9#$!.87<88">	 &: & &" !,,#$,a0	 -   %9$8$@$@
5!%11#$*  2  
 #')GL"9"9
--j999##G,,,$++E222;3 3 3 3 3 3 3 3 3 3 3 3 3 3 3@ **+@AA M M<c<<
$,$8$8$D$D!0D0DV0L0L!&u-M M M M M M M M M M M M M M M
 **733 	- 	-%11#&+W^KglKK">	 2  
 --j999##G,,,	- 	- 	- 	- 	- 	- 	- 	- 	- 	- 	- 	- 	- 	- 	-  
3d9?#=#=#=#J#JKKLL#(+
 __}<***+())*+,,-. 	
 	w'''"">222is8   2C'O%%O)	,O)	;QQ	Q	,ASS	S	c                     | j         j        D ]Y}t          |j        t          j        j                  r3|j        j        dk    r#t          |          st          j
        |j        d<   ZdS )z
    By default, the partitioner is not allowed to recompute collectives
    unless they come from a user-annotated AC region.
    See Note [Recomputing collectives in the partitioner]
    r  r}   N)r   r   r   r   r   r  
OpOverload	namespacer   r   	MUST_SAVEr~   )r   r7   s     r<   force_save_collectivesr    st     "( @ @t{EJ$9::	@%);;;"4(( < &6%?DIk"@ @r>   c                    t                      }t          | j        j                  D ]}|j        dk    r|j        t          j        j        j	        j
        k    }|rmt          |          r |                    |j        d                    t          |          r.|j        d         |v rt          j        |j        d         j        d<    d S d S )Nr   r   r    r}   )r   r  r   r   r   r   r   r  r  r   r  r   r  r   r   r   r~  r~   )r   has_mutation_in_bwr7   is_copy_s       r<   force_save_bw_mutation_srcr    s     5?LL+122  7h;%)."6">> 	+D11 5"&&ty|444*400 LTYq\EW5W5W1A1K	!!+. EE! r>   c                 n   | j         j        D ]}t          |          r|j        D ]A}t          |          r0|j        d         |j        d         k    rt
          j        |j        d<   B|j                            dd          r2t          d |j        D                       st
          j        |j        d<   | S )a  
    If there are two consecutive checkpointed blocks with no operator in
    between, we would still want to stash the tensor at the boundary of
    checkpointed blocks. The following pass makes the last output node
    non-recomputable to allow for that.
    ac_graph_idr}   has_backward_hookFc              3   4   K   | ]}t          |          V  d S r9   )r   r  s     r<   r]   z)cleanup_recompute_tags.<locals>.<genexpr>  sC       E E)-t$$E E E E E Er>   )	r   r   r   rf  r~   r   r~  r   r   )r   r7   rm  s      r<   cleanup_recompute_tagsr    s     "( D D$ 	D
 H H"4((H	-049]3KKK-=-GDIk*y}}0%88 D E E15E E E B B D& *:)C	+&r>   	node_infomin_cut_optionsdont_banc                   $%&'()*+,-./ t                      t                      .t          rZt          d | j        D                       }|t          d .j        D                       z
  }t
                              d|           d %d &%&.fd'	 dd l}n"# t          $ r}t          d          |d }~ww xY w'.fd	)).fd
}'fd(dt          f(.fd}	|                                -t                      $$-.fd}
| j        D ]$}|j        dk    r|j        v r^|j        vr+-                    |j        dz   dt"          j                   L-                    |j        dz   dt"          j                   t'          |          r+-                    |j        dz   dt"          j                   t)          |          st+          |          r |
|                               |          r ||          r |
|           d|j        vod|j        vp.d|j        v o%t1          |j        d         t2          j                   }t7          |          rt          t9          |                    }nO|r<t1          |j                            d          t<                    rdnt"          j        }n |	|j                  }-                    |j        dz   |j        dz   |           |j         D ]4}-                    |j        dz   |j        dz   t"          j                   5&dtB          tD          j#                 dtH          dtH          f'fd}j%        rj&        D ]}fd|j         D             }fd|j         D             }tO          |          dk    r ||tQ          |                    }tS          |j                   D ]}                    |          rz*                    |          |k    ra '||          rU|$v rAt
                              d|*                    |          ||*                    |                      |
|           j+        rt                      }| j        D ]w}                    |          s*                    |          |fg}*                    |          }tO          |          dk    rtY          j-        |          \  }}||v r0|.                    |           *                    |          |dz   k    rctO          |          dk    rPt
                              d||*                    |          *                    |                      |
|           nm|j         D ]Q}                    |          r: '||          r.|$vr*tY          j/        |*                    |          |f           RtO          |          dk    y	 |0                    -dd          \  }}n# tb          $ ru t
                              d            t
                              d!2                    |j3        j4        5                    -                               tm          -            w xY w|\  },t                      }-fd"|D             D ]'\  /}|7                    ,/fd#|D                        (t                      }|D ]<\  } }!| d d$         |!d d%         k    sJ | d d$         }"|.                    |"           =tq          |           *d& ts          | j                  D             +tu          *fd'|D             +fd()          }#|#$fS )*Nc              3      K   | ]=}|j         d k    t          |j        d          "t          |j        j                  V  >dS )r   _overloadpacketN)r   r   r   r   r  r   s     r<   r]   z solve_min_cut.<locals>.<genexpr>7  sZ       &
 &
w/))gdkCT.U.U) +,,))))&
 &
r>   c              3   4   K   | ]}t          |          V  d S r9   )r   r  s     r<   r]   z solve_min_cut.<locals>.<genexpr><  s9       4
 4
CFF4
 4
 4
 4
 4
 4
r>   z&Ops banned from re-materialization: %sc                    |j         t          j        j        j        k    rdS |j        d         }t          j        j                            |          \  }}|D ]2}|j	        |         }| |u r dS t          |t                    r| |v r dS 3dS NFr   T)r   r   r  higher_orderauto_functionalizedr   _higher_order_opsauto_functionalizeget_mutable_argsr   r   ro   )ab
mutable_opmutable_arg_namesr  r   r  s          r<   !can_fuse_into_auto_functionalizedz8solve_min_cut.<locals>.can_fuse_into_auto_functionalizedA  s    8uy-AAA5VAY
 #6GG
SS	
% 	  	 D(4.CCxxtt#t$$  8844ur>   c                     |j         t          j        j        j        k    rdS |j        d         }|D ]}|j        d         |         }| |u r dS dS )NFtensors_to_cloner   T)r   r   r  r   triton_kernel_wrapper_functionalr   )r  r  r  r   r  s        r<   .can_fuse_into_triton_kernel_wrapper_functionalzEsolve_min_cut.<locals>.can_fuse_into_triton_kernel_wrapper_functionalR  sd    8uy-NNN5H%78% 	 	D(8$T*CCxxtt ur>   c                 H   t          |          t          j        k    rdS  | |          rdS  | |          rdS | j        t          j        u r*| j        d         j        t          j        j	        j
        u rdS                     |           o                    |          S )NTr   F)r,   r  catr   r  r  r   r   r  r  r  r=   )r  r  r  r  op_typess     r<   r=   z!solve_min_cut.<locals>.is_fusible\  s     1))4,,Q22 	499!Q?? 	4H(((q	 y%FG G
 5""1%%@(*=*=a*@*@@r>   r   zANeed networkx installed to perform smart recomputation heuristicsc                 z                        |           rdS t          | g          }t          |          dk    r|                                }|j        D ]P}                    |          s ||          s dS                      |          r|                    |           Qt          |          dk    dS r  )rD   r   r   rd  rf  rg   r  )r7   r  currm  r=   r  r  s       r<   is_materialized_backwardsz0solve_min_cut.<locals>.is_materialized_backwardsv  s    D!! 	5v&&	)nnq  --//C	 ( ( //55  jjd>S>S  44##D)) (MM$''' )nnq   ur>   c                    | j         dk    rdS | j        t          j        k    rdS | j                            dd           t          j        k    rdS t          j	        r
                    |           rdS | j        t          j        j        t          j        j        fv rdS j        r                    |           sdS n,                    |           s                    |           rdS j        r; |           r0t(                              d| t-          | j                             dS | j        dk     r| j        t          j        k    rdS j        r6t7          d | j        D                       }t;          |           }|dz  |k     S dS )	Nr   Fr}   Tzmaterialized backwards: %s %si  c              3   h   K   | ]-}t          |t          j                  t          |          V  .d S r9   )r   rL   rM   r  r  s     r<   r]   zBsolve_min_cut.<locals>.should_ban_recomputation.<locals>.<genexpr>  sM       % % !*Q2H2H%% % % % % %r>   r   )r   r   r  r  r~   r   r   r~  r   recompute_viewsrD   r  lift_fresh_copyr  
lift_freshrz   rF   rB   r@   ry   r/   r  r   rf  dist_from_bwmax_dist_from_bwr{   r  r   r  )r7   input_tensors_sizeoutput_sizer  r  r  s      r<   should_ban_recomputationz/solve_min_cut.<locals>.should_ban_recomputation  s   7o%%5;(***59==d++/?/III4! 	h&6&6t&<&< 	5;4/79PQQQ52 	++D11 t !!$'' 8+H+H+N+N t 7 	<U<U=
 =
 	 II5tU4:=N=NOOO4 t##(9F<S(S(S4 + 	8!$ % %%)Y% % % " " #4..K?%777ur>   c                 d      j         dk    rdS t           fd j        D                        S )Nr   Tc              3   0   K   | ]} |          V  d S r9   rN   )r[   rm  r=   r7   s     r<   r]   z9solve_min_cut.<locals>.is_materialized.<locals>.<genexpr>  s/      EE$zz$--EEEEEEr>   )r   r  rf  )r7   r=   s   `r<   is_materializedz&solve_min_cut.<locals>.is_materialized  sA    7m##4EEEEE$*EEEEEEEr>   rW   c           
         t           j        r| |v rdS t          |           }t           j        r!                    |           rt
          j        S t          | j        d         t                    r,t          | j        d         t          j                  st          S t          |dt          t          | j        d          d          z  z            } |           r|S |dz  S )Nr   r   g?rA  r       )r    treat_parameters_as_free_to_saver  r  rD   r  r  r   r~   r   r   r   INT_INFrq   r  r  r  )r7   rV   mem_szr  r  s      r<   get_node_weightz&solve_min_cut.<locals>.get_node_weight  s    3	3331$! 	h&6&6t&<&< 	 8Odi&55 	di.==  Vsc#d.?*E*Eq&I&IIJKK?4   	MA:r>   c                                         |           rdS | v rDt          | j        t          j        j                  o| j        j        dk    }t          j        s|sdS t          |           rdS d| j
        v r't          | j
        d         t          j                  rdS                     |                                d| j        dz   t          j                   dS )NFr  r   source_incapacityT)rD   r   r   r   r  r|  r}  r   (unsafe_allow_optimization_of_collectivesr   r~   r   r  add_edger   r  r  )r7   is_collectivebanned_nodesr  nx_graphr  s     r<   ban_recomputation_if_allowedz3solve_min_cut.<locals>.ban_recomputation_if_allowed  s    D!! 	58 4;
(=>> @K)-??  > m u $ 	5DI*TYu-=u~"N"N5
 	(DI$5IIItr>   r   r  sinkr  _outr   r
          start_nodes	max_rangec                    g }| D ]-}t          j        |
                    |          |df           .t          |          dk    rt          j        |          \  }}}|s
                    |          S |j        D ]l}
                    |          rU
                    |          |k    r1
                    |          | 	||          f}||vrt          j        ||           mt          |          dk    |S )z
        Finds the first unfusible node in the chain of nodes starting from
        `start_nodes` and returns its position.
        Tr   )heapqheappushrn   r   heappoprf  rg   )r  r  sorted_nodesr\   r  r7   node_is_fusiblerm  r   r=   r  s            r<   find_first_unfusiblez+solve_min_cut.<locals>.find_first_unfusible>  s4   
 9; 	O 	OAN<)*@*@*C*CQ)MNNNN,!##',}\'B'B$At_" 4 --d333
 
: 
:++D11 	: --d33i?? !..t44"
4..6C
 ,..|S999 ,!## r>   c                 d    g | ],}                     |                              |          -S rN   )rg   rn   r[   rm  r  s     r<   r   z!solve_min_cut.<locals>.<listcomp>Z  sK       ++D11&&t,,  r>   c                 >    g | ]}                     |          |S rN   )rg   r  s     r<   r   z!solve_min_cut.<locals>.<listcomp>_  s<       I4L4LT4R4R  r>   z1used above/below fusible %s:(%s) -> %s -> %s:(%s)rA  ztoo long %s %s %s %sr  z-Failed to compute min-cut on following graph:
c              3   ,   K   | ]}||         fV  d S r9   rN   )r[   r\   r  s     r<   r]   z solve_min_cut.<locals>.<genexpr>  s,      88Q$888888r>   c              3   (   K   | ]}|v |fV  d S r9   rN   )r[   vnon_reachableus     r<   r]   z solve_min_cut.<locals>.<genexpr>  s1      AAa=.@.@q!f.@.@.@.@AAr>   c                     i | ]\  }}||	S rN   rN   r  s      r<   r  z!solve_min_cut.<locals>.<dictcomp>  s    HHHic4cHHHr>   c              3   (   K   | ]}|         V  d S r9   rN   r[   r7   name_to_nodes     r<   r]   z solve_min_cut.<locals>.<genexpr>  s(      22d	222222r>   c                     |          S r9   rN   )r   node_idxs    r<   r_   zsolve_min_cut.<locals>.<lambda>  s    (1+ r>   r`   );r   get_default_op_listr.   r   r6   r/   rJ  networkxImportErrorr   floatDiGraphr   rS   rQ   r  r   r  r  r   r   r   rg   r~   r   r   r  r   r   r   r   rV   rf  ro   rL   rM   rq   rw   rd   r   r  r   rn   rx   r  r  r  r  minimum_cut	Exceptionjoin	readwriteedgelistgenerate_edgelistvisualize_min_cut_graphr  get_name_to_noder  rb   )0r   r  r  r  joint_module_opsops_ignorednxer  r  r  r7   is_non_tensor_nodeweightrm  r  	used_nodeordersfw_usersfirst_unfusible_usevisited
start_nodefusiblestart_orderr  r  	cut_value	partition	reachablecutsetnbrs	cut_nodesnode_innode_out	node_namer   r  r  r  r=   r  r  r  r  r  r  r  r  s0    ```                                @@@@@@@@@@@@r<   solve_min_cutr  ,  s	    <<"$$H 	H% &
 &
#)&
 &
 &
 
 

 ' 4
 4
$54
 4
 4
 *
 *
 
 	9;GGG  "  A A A A A A A&   O
 
	
      0 0 0 0 0 0 0dF F F F Fe       < zz||H(2L       6 ! 3X 3X7h9...9+++!!$)e"3Vdh!OOO di&0&48LLL$ 	
 di%/$(KKKd 	/2488 	/((...
 ##D)) 	/.F.Ft.L.L 	/((... "E}DI'EUty SDIe4Del)S)S%S 	 t 	R=..//FF 	R!$)--"6"6FFTDH F %_T9+PQQF$)e+TY-?&QQQJ 	X 	XDdi&0$)e2CdhWWWW	X($rw- C C       4 , ;"4 	; 	;I   %O  F
   !*  H 6{{Q&:&:8S[[&Q&Q#!)/22 ; ;D!0066;%22488;NNN&Jy$77 O  <//$O%%229==/ %22488   54T::: 1 #V'1||%+ !	V !	VJ++J77 ''
33Z@2G $00<<Kg,,""w//3'>>C    **3//+2CCCG))HH."!..s33!..z::   10555I V VD!0066V&JsD11V !44w1G1G1M1Mt0TUUU5 g,,""8!~~h&II	99   @AAA2<0BB8LLMMNNN)))	  )I}*4,,F8888i888 B B4AAAAAdAAAAAAA!+I# ! !ss|x},,,,CRCL	i    #K00LHH9[5F+G+GHHHH2222	2228M8M8M8M  L %%s%   B" "
C,B<<C	X$ $A?Z#c                 4   dd l }dd l}|j                            |                                           }|                    |          d         }|                                D ]}| |                                         |                                         d         }|	                    t          |                     |t          d          k    r|                    d           t                              d           |                    d           d S )Nr   r  r  redz2Visualizing the failed graph to min_cut_failed.svgzmin_cut_failed.svg)r  pydotnx_pydotto_pydot	to_stringgraph_from_dot_data	get_edges
get_sourceget_destination	set_labelr   r  	set_colorr/   rJ  	write_svg)r  r  r  
dot_format	dot_graphedger  s          r<   r  r    s    LLL%%h//99;;J))*55a8I##%% " "$//++,T-A-A-C-CDZPs6{{###U5\\!!NN5!!!HHABBB,-----r>   c                  .   g t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j	        t           j
        t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j         t           j!        t           j"        t           j#        t           j$        t           j%        t           j&        t           j'        t           j(        t           j)        t           j*        t           j+        t           j,        t           j-        t           j.        t           j/        t           j0        t           j1        t           j2        t           j3        t           j4        t           j5        t           j6        t           j7        t           j8        t           j9        t           j:        t           j;        t           j<        t           j=        t           j>        t           j?        t           j@        t           jA        t           jB        t           jC        t           jD        t           jE        t           jF        t          jH        t           jI        t           jJ        t           jK        t           jL        } t           jI        t           jJ        t           jM        g}|t           jN        t           jO        t           jP        t          jR        t           jS        t           jT        t           jU        t           jV        t           jW        g	z  }|}| g t          j        t          jX        t           jY        t           jL        t           jZ        t          j[        t          j@        t           j[        t           j\        t          jR        t           jV        t           j]        t           jN        t           jS        t           jO        t           j^        t           j_        t           j`        t           ja        t           jb        t           jc        t           jd        t           je        t           jf        t           jg        t           jh        t           ji        t           jT        t           jj        t           jk        t           jl        t           jm        t           jn        t          jo        t          jp        z  } | t           jq        t           jr        gz  } | |z  } | t                      z  } | t           jt        gz  } | d t          D             z  } t          |           }t          t          dt          f                  t           jy        t           jz        t           j{        g          }t           j|        t           j}        t           j~        t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j        g}||z  }t          |t          |          |t          |          |          S )Nc                 ,    g | ]}t          |          S rN   )r   )r[   ms     r<   r   z'get_default_op_list.<locals>.<listcomp>]  s!     N N N1!3A!6!6 N N Nr>   .)r  r  subre  atan2r  r  r  pow	remainderfmod__and____or____xor__
__lshift__
__rshift__eqnegegtleltr  bitwise_notceilfloorfracnegreluroundsilutruncr/   log10log1plog2lgammaexpexpm1erferfccosacoscoshsinasinsinhtanatantanhatanhsqrtrsqrtr  sigmoidsoftplus	thresholdthreshold_backwardclampwherelerpaddcmulgelugelu_backwardr  mean_grad_sum_to_sizesum_to_sizer  totype_asr  r  squeeze	unsqueezersub_to_copyaliasviewslicetr  broadcast_in_dimexpand
as_stridedpermuteselectr9  r  clone	full_likevarstd_unsafe_viewreshapebroadcast_tensorsscalar_tensorones	new_zerosr  arangetriuvar_meanisinfr   fullzerosempty
empty_likeargmaxmaximumiota'_low_memory_max_pool_offsets_to_indicesr4  gatherr	  
zeros_liker   r   r   r   native_dropout	rand_like
randn_likemmconvolutionconvolution_backwardbmmaddmm#_scaled_dot_product_flash_attention'_scaled_dot_product_efficient_attention_flash_attention_forward_efficient_attention_forwardupsample_bilinear2d
_scaled_mmr1   )default_recomputable_opsrecomputable_view_opsr5   r6   r4   r3   r2   s          r<   r  r    ss   L0L0L0 	L0 	
	L0
 	L0 	L0 	L0 	L0 	L0 		L0 	L0 	L0 	L0 	L0 	L0  	!L0" 	#L0$ 	%L0& 	'L0( 	)L0* 	+L0, 	-L0. 	/L00 		1L02 	
3L04 		5L06 	7L08 		9L0: 	
;L0< 		=L0> 	
?L0@ 	AL0B 	
CL0D 	
EL0F 		GL0H 	IL0J 	KL0L 	
ML0N 	OL0P 		QL0R 	SL0T 		UL0V 		WL0X 	YL0Z 		[L0\ 		]L0^ 	_L0` 		aL0b 		cL0d 	
eL0f 		gL0h 	
iL0j 	kL0l 	mL0n 	oL0p 	qL0r 	sL0t 	
uL0v 	
wL0x 		yL0z 	{L0| 		}L0~ 	L0@ 	AL0B 		CL0D 	EL0F 	GL0H 		IL0J 	KL0L 	ML0N 	OL0P 	QL0R 	SL0T 		UL0V 	WL0Z "\4>4:F	


 
 %H $!	$!"$! 	
$! 		$!
 	$! 		$! 		$! 	$! 	$! 	$! 	$! 	$! 		$! 	$! 	
$!  	!$!" 	#$!$ 	%$!& 		'$!( 	)$!* 	+$!, 	-$!. 		/$!0 	1$!2 	
3$!4 	5$!6 		7$!8 	9$!: 	
;$!< 	
=$!> 	?$!@ 	A$!B 	C$!D 	
E$!F 	5G$! $L T[ 99(/!   N N N N NN!":;;HS#X./		dndo> J 	!
04%)  #Z/K())8  r>   c                 2    i }| j         D ]}|||j        <   |S r9   )r   r   )r   r  r7   s      r<   r  r  {  s-    L ' '"&TYr>   memoryruntimes
max_memoryall_recomputable_banned_nodesc                    t           j        }|dk    rt          |||          S |dk    rt          |||          S |dk    rt	          |||          S |dk    rkt
                              d           t          j        | |||          }t	          ||t          |          
                    t          |                    S t          |          r ||| |||          \  }}	d	||	fS t          d
|           )Ngreedyilpdpdynamic_memory_budget_dpzdynamic_memory_budget_dp is an experimental solver. It does not guarantee performance improvements. Additionally, it is not guaranteed to be stable.)r   r   recorded_knapsack_input_memories recorded_knapsack_input_runtimes)graph_info_provider)knapsack_algomax_mem_budgetr  z,Not aware of memory budget knapsack solver: )r   activation_memory_budget_solverr#   r$   r"   r/   warningr!   inialize_from_graphr%   get_knee_point_memory_budgetcallabler   )
r   r  r  r  r  r  SOLVERr  saved_node_idxrecomp_node_idxs
             r<   #_optimize_runtime_with_given_memoryr    sQ    3Fvx<<<	5FHj999	468Z888	-	-	-?	
 	
 	

 0C#*G-3-5	
 
 
 $7  **)) +  	
 	
 		
 
&		 T*0&KY8U+
 +
' ^_55R&RRSSSr>   no_dispatchr   r  c                     t          | j                  }fdfd|D             }fd|                                 D             }|                     ||          S )Nc                 &    t          |           S )Nr  )r   )dr  s    r<   realize_symbolz8_remove_symbols_without_guarding.<locals>.realize_symbol  s    H----r>   c                 &    g | ]} |          S rN   rN   r[   r  r  s     r<   r   z4_remove_symbols_without_guarding.<locals>.<listcomp>  s#    ...1^^A...r>   c                 &    g | ]} |          S rN   rN   r  s     r<   r   z4_remove_symbols_without_guarding.<locals>.<listcomp>  s#    444AnnQ444r>   )stride)ro   shaper  new_empty_strided)r   r  r  r  r  s    `  @r<    _remove_symbols_without_guardingr    s    MME. . . . . /......E4444444FuV444r>   c                 L   	 t           j        }d }|dk    rdS |dk    rnt                      5  ddlm} t          j        | j         j        f          \  	|	                    	 fd          }|cd d d            S # 1 swxY w Y   d S |dk    rdd	l
m} t          j        | j         j        f          \  	 |d
          5 }  j        i 	 d d d            n# 1 swxY w Y   |                                }t          |d          S t          d|           )Nc                 v   t          | t          j                  rAt          | j        d         t          j                  rt          | j        d         d          S t          | t          j                  rAt          | j        d         t          j                  rt          | j        d         d          S t          | t          j                  r't          | j        d         t          j	                  rdS t          | t          j                  r't          | j        d         t          j
                  rdS | S )Nr   r  r        ?T)r   rL   rM   r~   r   r  r  r   r   r   r   r  s    r<   materialize_argz)estimate_runtime.<locals>.materialize_arg  s    a!! 		j&M&M 		3AF5MDQQQQ27## 	
16%=%,(O(O 	AF5MD999927## 	
16%=%.(Q(Q 	327## 	
16%=%-(P(P 	4Hr>   testingr    profiler   )benchmarkerc                       j          i S r9   )r   )r   r   r7   s   r<   r_   z"estimate_runtime.<locals>.<lambda>  s    ;4;3O3O3O r>   flops)FlopCounterModeF)displayz Not aware of runtime estimator: )r   *activation_memory_budget_runtime_estimatorr  $torch._inductor.runtime.benchmarkingr  r   tree_mapr   r   benchmark_gputorch.utils.flop_counterr  r   get_total_flopsr  r   )
r7   RUNTIME_MODEr  r  msr  modecounted_flopsr   r   s
   `       @@r<   estimate_runtimer    s   DL
 
 
 y  q		"	"]] 	 	HHHHHH!??TY<TUULD&**+O+O+O+O+O+OPPB	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 
	 	 <<<<<<DK8PQQf_U+++ 	)tDK((((	) 	) 	) 	) 	) 	) 	) 	) 	) 	) 	) 	) 	) 	) 	),,..=!$$$LlLLMMMs$   ABB	BC%%C),C)c                     !"#$%&'( |dk    s|dk     rt          d|           t          t          j        t          j        t          j        t          j        t          j                  }t          j        rt          |dddd          }|dk    rj
        S t           |          \  }}|dk    r|S dt          t          j                 dt          fd	 j
                  % |          ##%k    r|S #%fd
 dt          t          j                 f#%fdt          |ddd          }t           |          \  }} |          |k     r|S t          |d          t                     \  }}	 |          |k     r|S ddlm t%          fdj
        D                       "dt$          t          j                 dt          t          j                 f"fd}
 |
|	          }d |D             &&fd|D             }t'          |t(          d          t+                    dk    r
j
        &z   S  fdD             $d D             (ddlm' $'(fd!t          j        r<! (fd} |d           |d          g}|d         dd          |d         dd          k    r|d         |d         fg}|r|                                \  }}|d         |d         z
  dk     r+|                    |           |                    |           Y ||d         |d         z   dz            }|dd          |dd          k    r|                    ||f           |dd          |dd          k    r|                    ||f           ||                                 dd lm} d  |D             }d! |D             }|                    d"#           |                    ||d$%           tA          |          D ])\  }}|!                    |d&|||         fd'd(d)*           *|"                    d+           |#                    d,           |$                    d-           |%                    d           |&                                }|'                                 tQ          j)                    }t          j*        "t          j*        }tQ          j+        |d.           d/}tX          j-        .                                r?tX          j-        /                                r!d0tX          j-        0                                 }tP          j1        2                    |d1| d2tg                       d3          }|4                    |           tj          6                    d4|            !| 5          d         S )6Nr    r   zJThe valid ranges for memory budget are 0 <= m <= 1. The provided value is )rw   rx   ry   rz   r{   F)rw   rx   ry   rz   r   rW   c                 L    t          t          t          |                     dz  S N    eA)r  mapr  )r   s    r<   estimate_activations_sizez:choose_saved_values_set.<locals>.estimate_activations_size	  s    3x..//#55r>   c                     | dz  z
  z  S r  rN   )szmax_act_sizemin_act_sizes    r<   get_normalized_sizez4choose_saved_values_set.<locals>.get_normalized_size	  s    S\L899r>   activationsc                 ,     |           z
  z
  z  S r9   rN   )r  r  r  r  s    r<   get_mem_ratioz.choose_saved_values_set.<locals>.get_mem_ratio	  s(    ))+66E<'
 	
r>   )rw   rx   ry   )rz   )get_node_storagec              3   .   K   | ]} |          V  d S r9   rN   )r[   r7   r  s     r<   r]   z*choose_saved_values_set.<locals>.<genexpr>5	  s/      TT4 0 0 6 6TTTTTTr>   r  c                 "    fd| D             S )Nc                 ^    g | ])}|j         t          d           k     r |          v'|*S )r  )r  rq   )r[   r  r  input_storagess     r<   r   zRchoose_saved_values_set.<locals>.get_recomputable_banned_nodes.<locals>.<listcomp>:	  sP     
 
 
 S))$$Q''~== 
 >==r>   rN   )r  r  r  s    r<   get_recomputable_banned_nodesz>choose_saved_values_set.<locals>.get_recomputable_banned_nodes7	  s3    
 
 
 
 
!
 
 
 	
r>   c                 d    g | ]-}|j                             d d          t          j        k    +|.S )r}   F)r~   r   r   r~  r  s     r<   r   z+choose_saved_values_set.<locals>.<listcomp>E	  sA       6::k5))-=-GGG 	
GGGr>   c                     g | ]}|v|	S rN   rN   )r[   r  must_save_nodess     r<   r   z+choose_saved_values_set.<locals>.<listcomp>J	  s*     ! ! !0H0H0H0H0Hr>   Tr  c                 @    g | ]} t          |                    S rN   r  )r[   r  r  s     r<   r   z+choose_saved_values_set.<locals>.<listcomp>W	  s8       -.HQKK((  r>   c                 ,    g | ]}t          |          S rN   )r  r   s     r<   r   z+choose_saved_values_set.<locals>.<listcomp>Z	  s.       #'  r>   r  c           
                      5  t          |t          | d          |          \  }}}d d d            n# 1 swxY w Y   t                      }|D ].}	 |                    |                    # t          $ r Y +w xY w|                              sJ t          ||
|          \  }}	t          rt          |||||           ||fS )Nr   )r   r  saved_node_idxsrecomputable_node_idxsexpected_runtimememories_banned_nodesruntimes_banned_nodesmin_cut_saved_values)	r  r  r   r  BaseExceptionissubsetr  r.   r   )memory_budgetr  r   r  r  r  r  r  r   r  aggressive_optionsr  r  r  r  s             r<   get_saved_values_knapsackz:choose_saved_values_set.<locals>.get_saved_values_knapsack_	  sz   []] 	 	
 4%%M1%%- 	 &		 	 	 	 	 	 	 	 	 	 	 	 	 	 	 )3) 	 	C:3?@@@@       !>?????'	
 
a ! 
	4'.K /'=!1&;&;%1	 	 	 	 ---s!   '?AAA99
BBc                 b     |           \  }}| t                    |z
   |          fS )N)r  r   )r  )r  r   r  r  r  r   r  r  s      r<   estimate_for_budgetz4choose_saved_values_set.<locals>.estimate_for_budget	  sU    -F-FYK. . .*L* )**-==l++ r>   r  r  gMbP?r  c                     g | ]
}|d          S )r  rN   r[   items     r<   r   z+choose_saved_values_set.<locals>.<listcomp>	      000DG000r>   c                     g | ]
}|d          S r    rN   r  s     r<   r   z+choose_saved_values_set.<locals>.<listcomp>	  r  r>   )
      )figsizeo)markerz.4fzoffset points)r   r  center)
textcoordsxytexthazMemory Budgetz Runtime of Recomputed Componentsz:Pareto Frontier of Memory Budget vs. Recomputation Runtime)exist_okr]  _rank_memory_budget_paretor  z.svgz%Generated Pareto frontier curve at %s)r  r  r   )7r   rv   r   ban_recompute_used_far_apart!ban_recompute_long_fusible_chains#ban_recompute_materialized_backwardban_recompute_not_in_allowlistban_recompute_reductionsaggressive_recomputationr   rQ   r  ro   rL   rM   r  torch._inductor.fx_utilsr  r   rb   r  r   torch.utils._mode_utilsr  visualize_memory_budget_paretord  r   sortmatplotlib.pyplotpyplotfigureplotr  annotatexlabelylabeltitlegridgcfshowosgetcwdmemory_budget_pareto_dirmakedirsr   r  r  is_initializedget_rankpathr  r(   savefigr/   r  ))r   r  r  r  runtime_optimized_saved_valuesr  more_aggressive_optionsmore_aggressive_saved_values%aggressive_recomputation_saved_valuesr  r  recomputable_banned_nodesr  optionsbisectslhsrhsmidpltx_valuesy_valuesr  txtfigfig_dirrank_suffixfig_namer  r  r  r  r  r  r  r  r  r  r  r  r  r  s)   ``                         @@@@@@@@@@@@@@r<   choose_saved_values_setr.    s   
 qMA--hYfhh
 
 	
 $$A#)#K%+%O & E8  O & 
!"'',).$)
 
 
 (5) )%"A --6RW 6% 6 6 6 6 -,Y-=>>L,,-KLLL|##--: : : : : :
4= 
 
 
 
 
 
 
 

 &##(%*	   '4Y 7' '# ! }122]BB++  %   ;HY 2; ;7)< }:;;mKK44999999TTTT9CSTTTTTN
 )
	bg
 
 
 
 
 
 
 !> =l K K *  O
! ! ! !,! ! ! %+!x% % %! ())Q../11   2O   +H   433333). ). ). ). ). ). ). ). ).V , AG	 	 	 	 	 	 	 	 	 '&s++-@-@-E-EF1:abb>WQZ^++
GAJ/0G 
/";;==Sq6CF?T))NN3'''NN3'''))3q6CF?a*?@@qrr7c!""g%%NNC:...qrr7c!""g%%NNC:...  
/ 	''''''0000000000 	

7
###8C000  )) 	 	FAsLLhqk"*      	

?###

5666		NOOOggii


)++*65GK$////))++ 	B0A0P0P0R0R 	BA5#4#=#=#?#?AAK7<<TKTT:L:N:NTTT
 
 	H;XFFF %$#yk  	 	r>   c                 T   ddl m d }fd}t          j                                        rwt          j                                        rXt          j                                        dk    r5 ||           r) ||           rt                      5               5  d |D             g}d t          t          j                                                  D             }t          j        	                    ||d                    t          |           g }i }t          |          D ]t\  }}	fd|	D             }
d}|
D ]B}t          |          }||z  }|t          j                                        k    r
|||j        <   C||d	<   |                    |           ut          j        |t          j        j                                        
          }t          j                            |t          j        j        j        j                   t-          t          j        |                                                    }d| d| t3          dd fd           fd||         D             }d d d            n# 1 swxY w Y   d d d            n# 1 swxY w Y   |S )Nr   )unset_fake_temporarilyc                     | j         D ]7}t          |j        t          j        j                  r|j        j        dv r dS 8dS )N>   c10d_functionalr  TF)r   r   r   r   r  r|  r}  )r   r7   s     r<   has_collectivesz3_sync_decision_cross_ranks.<locals>.has_collectives	  sP    % 	 	DUZ2  +'+RRRttur>   c                 0   d                     d | j        D                       }t          j        |                    d                                                    }d t          t          j        	                                          D             t                      5               5  t          j                            |           d d d            n# 1 swxY w Y   d d d            n# 1 swxY w Y   t          fdD                       S )N/c              3   $   K   | ]}|j         V  d S r9   r_  r[   r   s     r<   r]   zE_sync_decision_cross_ranks.<locals>.has_same_nodes.<locals>.<genexpr>	  s$      >>qAF>>>>>>r>   zutf-8c                     g | ]}d S r9   rN   r[   r  s     r<   r   zF_sync_decision_cross_ranks.<locals>.has_same_nodes.<locals>.<listcomp>	  s    NNNqdNNNr>   c              3   0   K   | ]}d          |k    V  dS r  rN   )r[   r   
all_inputss     r<   r]   zE_sync_decision_cross_ranks.<locals>.has_same_nodes.<locals>.<genexpr>	  s,      ::!:a=A%::::::r>   )r  r   hashlibsha256encode	hexdigestr  r   r  get_world_sizer  all_gather_objectr  )r   node_strrQ   r;  r0  s      @r<   has_same_nodesz2_sync_decision_cross_ranks.<locals>.has_same_nodes	  s   
 88>>K,=>>>>> 8 899CCEENNE%*;*J*J*L*L$M$MNNN
]] 	D 	D2244 	D 	D//
FCCC	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D ::::z::::::s6   #C2.!CC2C	C2"C	#C22C69C6r    c                     g | ]	}|j         
S rN   r_  r7  s     r<   r   z._sync_decision_cross_ranks.<locals>.<listcomp>	  s    5551555r>   c                     g | ]}g S rN   rN   r9  s     r<   r   z._sync_decision_cross_ranks.<locals>.<listcomp>	  s%     : : :: : :r>   c                      g | ]
}|         S rN   rN   )r[   op_namer  s     r<   r   z._sync_decision_cross_ranks.<locals>.<listcomp>
  s    TTT|G4TTTr>   z
total size)r+  r   zpicked_rank_idx=z, saved_nodes of current rank=rs  c                      dddS )N)aot_joint_graph_sync_decision_cross_ranksrv  rw  rN   rN   r>   r<   r_   z,_sync_decision_cross_ranks.<locals>.<lambda>
  s    G (% % r>   c                       S r9   rN   )sync_decision_cross_ranks_strs   r<   r_   z,_sync_decision_cross_ranks.<locals>.<lambda>
  s    #@ r>   r  c                      g | ]
}|         S rN   rN   )r[   r\   r  s     r<   r   z._sync_decision_cross_ranks.<locals>.<listcomp>"
  s*       $%Q  r>   )torch._subclasses.fake_tensorr0  r   r  r  r  r@  r  r  rA  r  r  r  r  r   r   r-  distributed_c10d_get_object_coll_device
all_reduceReduceOpMAXrq   argminr  r   )r   r   r3  rC  objectssaved_ops_names_all_rankssaved_sizessaved_ops_with_sizesr  saved_ops_namessaved_nodes
saved_sizer7   size_of_nodesaved_sizes_tensorpicked_rank_idxr  rK  r0  s                   @@@r<   _sync_decision_cross_ranksr^  	  s    EDDDDD  ; ; ; ; ; 	&&((1,,..1 ,,..22OK(( 3N;'' 3 ]] *	 *	2244 *	 *	555556G: :!%"3"B"B"D"DEE: : :% //0I7ST:VVV+K88L%'K35 (12K(L(L 	/ 	/$_TTTTOTTT
' G GD#+D>>L,.Je/88:::::F,TY75?$\2"":....!&(9QQSS" " " (("u'8'I'R'V )    "%,/A"B"B"G"G"I"IJJO -E  -E  -E  oC  -E  -E)  A@@@      )B?)S  LQ*	 *	 *	 *	 *	 *	 *	 *	 *	 *	 *	 *	 *	 *	 *	 *	 *	 *	 *	 *	 *	 *	 *	 *	 *	 *	 *	 *	 *	 *	X s7   J$GJ:JJ
	
JJ
	JJ!$J!c                    d}|rdnd}t          t          | j                            d                              }| j                            dt          j        j        j                  D ]}t          | |j	        d         j
                  }t          |t          j                  rg }t          |j                            d                    D ]\  }}	||	j        v r| j                            |          5  | j                            | d|           }
|d	z  }|	j        d
         |
j        d
<   |
}|                    |
           ddd           n# 1 swxY w Y   |r| j                            |          5  | j                            dt          j        j        j        g |j	        |R i           }|                    |d           ddd           n# 1 swxY w Y   |j                            d          }|r"|\  }}g |d |D             R }||f|j        d<   | j                            |           | S )u  
    Graph-safe RNG lets torch.compile use CUDA Graphs for graphs with RNG ops.
    For graphs without HOPs, the partitioner adds placeholder nodes
    fwd_rng_state_* and bw_rng_state_* to the forward and backward graphs. At
    runtime, the AOTDispatcher retrieves these RNG states and passes them to the
    compiled graphs.

    This works well for no-HOP graphs. With HOPs, the partitioner runs
    recursively: it first partitions the HOP (producing forward/backward HOP
    subgraphs) and then stitches them back into the outer joint graph. For HOPs
    that contain RNG ops, the outer joint graph now includes HOP subgraph
    modules with extra RNG placeholders. We must thread these placeholders
    through the outer module partitioned forward and backward graphs—this
    function does exactly that. It collects the RNG placeholder nodes from the
    HOPs and creates corresponding placeholders in the outer forward and
    backward graphs.

    There is a catch: for a short period, the joint graph is in a “bad” state.
    The HOP subgraphs expect additional inputs (because of the new
    placeholders), but the outer graph call sites don't yet provide them. We
    can't fix this in the joint graph because the joint graph's input signature
    is fixed (primals, tangents). As a compromise, we keep the joint graph in
    somewhat of a bad state for some time and, once the outer forward and
    backward graphs are partitioned, insert the corresponding RNG placeholders
    and wire up the calls.
    r   r>  r=  r   r   r   )r   r   r  r    r   NT)propagate_metaeager_input_valsc                 (    g | ]}|j         d          S )r   )r~   )r[   inps     r<   r   z2thread_graphsafe_rng_from_hops.<locals>.<listcomp>p
  s    DDDc#(5/DDDr>   )r   r  r   r   r   r  r  invoke_subgraphr8  r   r   r   rL   r!  r  r   r  r   r~   r   r8  r  r   r  )moduler   r,  
rng_string
last_inputhop_noder   new_rng_inputsr  placeholder_noder2  new_hop_node_with_fixed_args
eager_vals
eager_argseager_kwargsnew_eager_argss                   r<   thread_graphsafe_rng_from_hopsrp  )
  s   8 I$/D_Jhv|66-6HHIIJJJL++59#9#I ,   .2 .2 68=#3#:;;h// *	2N)2))]);;* * 9 9%% !1!666  55jAA 9 9$*L$<$<)77I77% %	 "Q	0@0Ee0L	u-%.
&--i8889 9 9 9 9 9 9 9 9 9 9 9 9 9 9  2\11(;; 	 	39<3K3K'	.>9(-9.99	4 40 224T 3   	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 &]../ABB
 	/9,J&#&DD^DDD& &N
 '$M056HI ''111Ms&   >AEE"EAG""G&	)G&	rQ  )r  c                P	   | j                                          |                                  | j         }t          j        rt          |          }|| _         | j         }t          |           }t          |           }	|rt          |           } t          j	        st          |            t          |            fd}
|g } |
| |          }t          |j                  dk    rt          | |||j                  S t!          | j         j                  D ]}|j        dk    rt'          d          |_        "|                    |          sd|_        ?t'          d          |_        |j        D ]$}t/          |j        |j        dz             |_        %t          j        }|j        D ]?}t3          |j                            dd          t8                    r|j        d         } n@t;          |||	          }t          j        rt=          ||          }t?          tA          tB          |                    }t?          tA          d
 |                    }tE          | |||j                  \  }}|r$|	r"tG          | ||t          |                    \  }}tI          |          }tK          |          }tK          |          }tM          |d          }tM          |d          }tN          rtQ          d |D                       }tS          d |D                       dz  }tT          +                    d|           tT          +                    d|           tY          d |j         j        D                       }tY          d |j         j        D                       }||z  }t[          t&                    }|j         j        D ]G}|j.        |v r<t_          |j0        d          r'|tc          |j0        j2                  xx         dz  cc<   HtT          +                    dt          |          t          |          t          |                     tQ          |3                                ti          j5        d          d          }tT          +                    d|           ||fS )ax  
    Partitions the joint graph such that the backward recomputes the forward.
    Recomputing helps in trading off memory bandwidth with computation.

    To create the fwd and bwd graph, we copy the joint graph, manually set the
    outputs to just original forward or backward outputs. And then we run the
    resulting graphs through dead code elimination.

    .. warning::
        This API is experimental and likely to change.

    Args:
        joint_module(fx.GraphModule): The joint forward and backward graph. This
            is the result of AOT Autograd tracing.
        _joint_inputs: The inputs to the joint graph. This is unused.
        compiler: This option determines the default set of recomputable ops.
            Currently, there are two options: ``nvfuser`` and ``inductor``.
        recomputable_ops: This is an optional set of recomputable ops. If this
            is not None, then this set of ops will be used instead of the
            default set of ops.
        num_fwd_outputs: The number of outputs from the forward graph.

    Returns:
        Returns the generated forward and backward Fx graph modules.
    c                    t          | j                  t                      | j        j        D ]n}|j        dk    rd|j        v r                    |           n$t          |          r                    |           |v r                    |j	                   ot          t          t          | j        j                            }t          t          t          | j        j                            }||z   }t          |           \  }}}}	                    d |D                        t          | j        |||d          }
t          fd|
j        D                       t          fd| j        j        D                       }t          fdt!          |          D                       }d	}i }| j        j        D ]}|v r
|||<   |d
z  }t#          ||||          S )Nr   r   r  c              3   4   K   | ]}||j         dk    |V  d S )Nr   r   )r[   r  s     r<   r]   zNmin_cut_rematerialization_partition.<locals>.classify_nodes.<locals>.<genexpr>
  s;       !
 !
am8H8HA8H8H8H8H!
 !
r>   r   c              3   H   K   | ]}|j         d k    |j                 V  dS r  r  r  s     r<   r]   zNmin_cut_rematerialization_partition.<locals>.classify_nodes.<locals>.<genexpr>
  sC       <
 <
w("" #""""<
 <
r>   c              3   ,   K   | ]}|v|v
|V  d S r9   rN   )r[   r7   rS   rd   s     r<   r]   zNmin_cut_rematerialization_partition.<locals>.classify_nodes.<locals>.<genexpr>
  sF       :
 :
,,,=N1N1N 1N1N1N1N:
 :
r>   c              3   *   K   | ]\  }}|v 	|V  d S r9   rN   )r[   r  pr  s      r<   r]   zNmin_cut_rematerialization_partition.<locals>.classify_nodes.<locals>.<genexpr>
  s;       1
 1
!Qa;X6X6XA6X6X6X6X1
 1
r>   r   r    )r  r   r   r   r   r   r  r   r  rf  ro   r  r   r   r   r   r  rP   )r   r  r7   r  r  rQ   r   r   r   r   r  rT   rV   fw_cntrU   r  rS   rd   r   s    `             @@@r<   classify_nodesz;min_cut_rematerialization_partition.<locals>.classify_nodes
  s   '(:;;1; &, 	5 	5Dw-''J$+,E,E!%%d++++%d++ ,!%%d+++(((!((444VJ0B0HIIJJ!%&(:(@AA"
 "
 !77$\?SSS 	G["35F 	   !
 !
"!
 !
 !
 	
 	
 	
 @5F	
 
 2< <
 <
 <
 <
*0<
 <
 <
 2
 2

 0: :
 :
 :
 :
 :
$*0:
 :
 :
 0
 0

 '1 1
 1
 1
 1
#M221
 1
 1
 '
 '
#  &, 	 	D(((!'!'
 
 	
r>   Nr   )r   r  rV   r   r  r    r  )r  c                 "    t          |            S r9   r  )r\   s    r<   r_   z5min_cut_rematerialization_partition.<locals>.<lambda>  s    [^^); r>   r  F)r   Tc                 J    g | ] }t          |          t          |          f!S rN   )r  r   r  s     r<   r   z7min_cut_rematerialization_partition.<locals>.<listcomp>)  s)    KKKSVV4KKKr>   c              3   4   K   | ]}t          |          V  d S r9   r  r  s     r<   r]   z6min_cut_rematerialization_partition.<locals>.<genexpr>,  s(      'J'J'J'J'J'J'J'Jr>   z'Theoretical Activations Stored: %.2f GBz,Theoretical Per Activation Storage Sizes: %sc              3   :   K   | ]}|j         d k    |j        V  dS r   Nr  r   s     r<   r]   z6min_cut_rematerialization_partition.<locals>.<genexpr>1  9       %
 %
47o;U;UDI;U;U;U;U%
 %
r>   c              3   :   K   | ]}|j         d k    |j        V  dS r~  r  r   s     r<   r]   z6min_cut_rematerialization_partition.<locals>.<genexpr>4  r  r>   r  z# remat/fw/bw: %d/%d/%dr  zCount of Ops Rematerialized: %s)6r   r   rf  r   cser+   r   r   r  r  r  r  r   rS   r  rV   r  r   r   rq   r  rg   rf  r  activation_memory_budgetr   r~   r   r  r.  r^  ro   r  r   r  rz  r&  r-   rp  r.   rb   r  r/   rJ  r   r   r   r   r   r   r  r  r  r  )r   r  compilerr   r  r   	cse_graphr   graph_has_recomputable_opsgraph_has_recomputable_rng_opsry  r  r7   rm  r  r   r  r'  r(  sorted_sizestotal_activations_size_gbfw_module_nodesbw_module_nodesremat_nodescountsrematerialized_opss      `                      r<   r  r  {
  s*   D **,,,D z ' &&	&$K!5l!C!C%=l%K%K"! <-l;;: -|,,,|,,,4
 4
 4
 4
 4
l %,(*%|-JKKI
 9&''1,, +*G(1(M
 
 
 	
 +122 R R7h #CD))$// 	R !D #CD
 R R$'(94;Lq;P$Q$Q!!R 3M!  dimmOT::EBB 	 Io6ME	 +#  L
 ( M1+|LL6+|<<==O;;\JJKKL 4''$-$I  Iy " ) 	#8iC4H4H$ $ Iy 4I>>I y))Iy))I.yeLLLI.ydKKKI HKKlKKKLL %('J'J\'J'J'J$J$JS$P!:<UVVV 	?NNN$ %
 %
"+/"7%
 %
 %
 
 
 % %
 %
"+/"7%
 %
 %
 
 
 &7!,S!1!1O) 	> 	>DyK''GDKAR,S,S's4;677888A=888%    		
 	
 	
 $LLNN 3A 6 6
 
 
 	24FGGGir>   fx_graphTFtracedfnamefigname
clear_metaprogparse_stack_tracedot_graph_shapec                    |rDt          j        | j                  }t          j        | |          } | j        j        D ]	}i |_        
t          j        	                    |          \  }	}
|
sdt          j        z   }
t                              d|	|
           t          j        | |||          }|                                }t#          |d|
                    d          z             }|	 |
 }| ||           d S  |||           d S )Nr7  zWriting FX graph to file: %s%s)r  r  write_)r  )re  deepcopyr   rL   r!  r   r~   r  r  splitextr   torch_compile_graph_formatr/   rJ  r   FxGraphDrawerget_main_dot_graphr8  lstrip)r  r  r  r  r  r  r  r   r7   baseextgr   write_methods                 r<   
draw_graphr  J  s'     M&,//		22L& 	 	DDII  ''ID# 6F55HH-tS999"+'		 	 	A 	
A1hC899LNSNNE|UU&&&&&&r>   r9   )r  r  r  )rQ  )r  TNFN)re  rr   r<  r  r  loggingr  r  r  os.pathr  r   dataclassesr   r   typingr   r   r	   r
   r   r   torch._inductor.inductor_primstorch.distributedtorch.fxrL   torch.utils._pytreeutils_pytreer   torch._dynamo.utilsr   r   ;torch._functorch._activation_checkpointing.ac_logging_utilsr   torch._inductorr   r  torch._loggingr   rM  r   %torch.fx.experimental._backward_stater   "torch.fx.experimental.proxy_tensorr   r   torch.fx.experimental.sym_noder   r   %torch.fx.experimental.symbolic_shapesr   r   r   r   r   r   torch.fx.passesr   torch.utils._ordered_setr   torch.utils.checkpointr   r]  -_activation_checkpointing.graph_info_providerr!   "_activation_checkpointing.knapsackr"   r#   r$   ,_activation_checkpointing.knapsack_evaluatorr%   _aot_autograd.descriptorsr&   r'   _aot_autograd.logging_utilsr(   _aot_autograd.utilsr)   r*   compile_utilsr+   r,   r-   sympydebug_partitionerr.   rt   rK   	getLoggerrG   r/   Loggerr  r  r  r1   rP   rv   rM   r   r!  r   r   rq   r   r   r   r   ro   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r"  r:  r,  r  r2  r>  rE  rG  rK  rY  rn  rp   r  r  r  r  r  r  r  r  cacher	  r  r&  r+  rC  rz  r  r  r  r  r  r  r  r  r  r  r  r  r.  r^  rp  r  r  rN   r>   r<   <module>r     s                 				  # # # # # # * * * * * * * * @ @ @ @ @ @ @ @ @ @ @ @ @ @  % % % %           $ $ $ $ $ $ $ $ $ < < < < < < < <      6 5 5 5 5 5 + + + + + + A A A A A A ? ? ? ? ? ? H H H H H H H H L L L L L L L L                ) ( ( ( ( ( / / / / / / 3 3 3 3 3 3       L L L L L L         
 L K K K K K L L L L L L L L ; ; ; ; ; ; M M M M M M M M H H H H H H H H H H  LLL %6 t 6 6 6'g'11W^ 1 1 1y~	 > > > > > > > >2                >         T    r~ $    2> d     C           
 o #J JJMJ "']J 	?	J
 smJ XJ J J JZRW     Gbg G$ G G G Gbg $    bg $    XRW X X X X XCrw C4 C C C CJbg J$ J J J JKrw K4 K K K K-bg -$ - - - -rw 4    J.J
4=$rw-i$y/IJJ J J J$$rw- s    d27mU27^;<    	J J8>J
(-J 
J 
	J J J JZB!8>B!
(-B! B! 	B!
 B! B! X]B! B! B! B!J9%, 95 9 9 9 9"D-    G%(- GD G G G G45 5 5 5 5	5; 	5 	 	 	 	8G%(. 8GT 8G 8G 8G 8GvMG%(. MGT MG MG MG MG`UUU CL)U 
	U U U Ux BF	,W ,Wrw-,W,W ,W "**RW*=!>	,W
 
,W ,W ,W ,Wj BFx" x" x".x"rw-x" "']x"
 x" "**RW*=!>x" 2>2>)*x" x" x" x"@ :>AE\ \ \.\
 $,DI#6\ "**RW*=!>\ 2>2>)*\ \ \ \~ #c(("# " " " " "27 s    :Rbh R R R R   "Pbgsl!3 PU27C<=P8Q P P P PJBN Jr~ J J J JZZ*x#Z*x#Z* X]Z* X]	Z*
 LZ* Z* HMZ* HMZ* Z* Z* Z*zR .R ~R  ~R  	R 
 2>2>)*R  R  R  R j@ @D @ @ @ @R^     .# #BN # # # #T /3	S& S&S&S& #S& z"'*+	S& S& S& S&l. . ."eW e e e ePBH    +T+TK+T 5k+T 	+T
 +T $(=+T 5$s)T#Y&'+T +T +T +T\ 0 / / / / /5 5 5 5 5 5 5$N $N $NT o	 o	o	o	 
"']	o	 o	 o	 o	dNN/3EHM/BN N N NbO O Oj L  :>L  L  L .L  $,DI#6L  2>2>)*L  L  L  L d ,0#%)' 'H '' ' 	'
 5d3i(
)' ' c]' 
' ' ' ' ' 'r>   