
    `i                      d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	m
Z
 d dlmZ d dlmZmZmZmZ d dlZd dlmZ d dlmZ d dlmZ d	d
lmZmZ d	dlmZ erd	dlmZmZ d	dl m!Z! d	dl"m#Z#m$Z$m%Z%m&Z&m'Z' d	dl(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/ d	dl0m1Z1  ej2        e3          Z4ej5        6                    e3d          Z7erd dl8m9Z9 dLdZ:dMdZ;dMdZ<dMdZ=dMdZ>e G d d                      Z?dNd Z@dOd#ZAd$ ZBdPdQd'ZCd( ZDdRd*ZEd+ ZFdSd-ZGdTd/ZHdUd3ZIdVd5ZJe G d6 d7                      ZKdWd9ZLdMd:ZMdXd<ZNd= ZOd> ZPdMd?ZQdYdBZRdZdDZSdE ZTd[dKZUdS )\    )annotationsN)defaultdict)	dataclass)AnyOptionalTYPE_CHECKINGUnion)trace_structured)StorageWeakRef)
OrderedSet   )configir)WeakDep)IRNode	Operation)SchedulerBuffer)estimate_peak_memoryestimate_peak_memory_allocfreeFreeableInputBufferget_freeable_input_bufSNodeMemory)contains_collectivecontains_waitfind_recursive_deps_of_nodefind_recursive_users_of_nodeis_collectiveis_fallback_opis_wait)Voverlap)BaseSchedulerNodesnodeslist[BaseSchedulerNode]c                   i }| D ]}|                                 ||<   dd lm} ddlm} |                                } |            }d t          |          D             }|                    |t          |	                                          |           t          j        t          j        |          d          j	                                        }t          t          |                     D ]}	||	         | |	         _        d S )Nr   )_get_default_groupc                    g | ]}g S  r(   ).0_s     i/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torch/_inductor/comms.py
<listcomp>zJalign_runtime_estimations_across_all_distributed_ranks.<locals>.<listcomp>B   s    6U6U6Uar6U6U6U    )dim)get_estimated_runtimetorch.distributeddistributed"torch.distributed.distributed_c10dr&   get_world_sizerangeall_gather_objectlistvaluestorchmediantensortolistlenoverride_estimated_runtime)
r#   runtime_estimationssnodedistr&   
world_sizepggathered_runtime_estimationsmedian_runtime_estimationsis
             r+   6align_runtime_estimations_across_all_distributed_ranksrF   7   s?     C C%*%@%@%B%BE""$$$$$$EEEEEE$$&&J				B6U6U5CTCT6U6U6U $d+>+E+E+G+G&H&H"   "'122" " "VVXX  3v;; M M/I!/Lq	,,M Mr-   returnc                (    t          | ddd          S )z7
    Greedily schedules waits as late as possible.
    FTraise_comms
sink_waitsreorder_for_overlap_schedule_for_commr#   s    r+   rK   rK   M   s$     Ed   r-   c                (    t          | ddd          S )z8
    Greedily schedules comms as early as possible.
    TFrI   rM   rO   s    r+   rJ   rJ   V   s$     DU   r-   c                (    t          | ddd          S )a  
    This achieves the following overall scheduling procedure:
        Step 1: Given that we've currently scheduled comm N, we now schedule all compute nodes
            that are required for comm N + 1 but do not depend on comm N, to run at the same time with comm N.
        Step 2: If all those compute nodes are sufficient to overlap comm N, we're done.
            Otherwise, we now need to look elsewhere to find compute that overlaps with comm N.
            We prioritize compute nodes that are needed sooner.
        Step 3: We schedule the compute nodes dependent on comm N and required for comm N + 1.
        Step 4: We schedule comm N + 1.
        Repeat this for subsequent comm nodes.
    TrI   rM   rO   s    r+   reorder_compute_for_overlaprR   _   s$     DTt   r-   c                *    t          |           \  }}|S )a  
    Reorders communication ops relative to computation ops to improve communication-compute overlapping and hide comm
    latency.  Stops moving a particular op if it reaches a point that would have increased the peak memory footprint.

    Currently, follows these heuristics (subject to change or tune):
    - never reorders collectives relative to one another, for SPMD safety
    - has an option for per-collective prefetch limit, but does not enable it by default
    - limits the total number of reorder steps to some factor of the graph size to prevent worst-case quadratic
      performance

    Prerequisite: sink_comms_and_waits - ensure comm and wait nodes are scheduled as late as possible, respecting data
    dependencies.  That allows reorder_communication_preserving_peak_memory to take a best case peak-memory snapshot,
    and then monotonically improve latency by moving collectives backward in time.

    Peak memory impact is computed in an iterative fashion.  First, memory use at each timestep is computed, and global
    peak memory is computed as a max over timesteps.  Then, when swapping any two adjacent nodes, only the curr-memory
    for the earlier of the nodes after the swap is affected.  This enables checking step by step whether a swap is
    peak-memory-safe, and bailing out if not.  Example:

    0   n0      C0
    1   n1      C0 + Allocs(n1) - Frees(n1)
    2   n2      C0 + Allocs(n1) - Frees(n1) + Allocs(n2) - Frees(n2)

    0   n0      C0
    1   n2      C0 + Allocs(n2) - Frees(n2)    <-- After moving n2 to Time 1, only time1 memory changes
    2   n1      C0 + Allocs(n2) - Frees(n2) + Allocs(n1) - Frees(n1)

    )6_reorder_communication_preserving_peak_memory_internal)r#   reordered_snodes
node_statss      r+   ,reorder_communication_preserving_peak_memoryrW   r   s"    @ 	?vFF !j r-   c                  ~    e Zd ZU dZdZded<   dZded<   dZded<   d	Zd
ed<   d	Z	d
ed<   dZ
ded<   ed             ZdS )ReorderInfozE
    Debug info describing how an individual snode was reordered
    floatinitial_exposedfinal_exposedNonestrlimiting_factorr   intmovesgrouped grouped_infoc                     | j         | j        z
  S N)r\   r]   )selfs    r+   improvementzReorderInfo.improvement   s    #d&888r-   N)__name__
__module____qualname____doc__r\   __annotations__r]   r`   rb   rc   re   propertyri   r(   r-   r+   rY   rY      s            OM!O!!!!ENNNNGL9 9 X9 9 9r-   rY   node"Optional[Union[IRNode, Operation]]boolc                    | dS t          | t          j        j        j        j                  rdS t          | dd           x}rd|v rdS dS )NFTpython_kernel_nameextern_kernels)r   r8   opsaten#_scaled_dot_product_flash_attentiondefaultgetattr)rp   rt   s     r+   is_gemm_liker{      sh    |u	:B   t &d,@$GGG
0
0
0t5r-   r?   r"   c                    ddl m} t          | |          rt          d | j        D                       S t          | j                  S )Nr   GroupedSchedulerNodec              3  4   K   | ]}t          |          V  d S rg   )contains_gemm_liker)   xs     r+   	<genexpr>z%contains_gemm_like.<locals>.<genexpr>   s+      ??Q%a((??????r-   )torch._inductor.schedulerr~   
isinstanceanyr#   r{   rp   )r?   r~   s     r+   r   r      sV    >>>>>>%-.. (??%,??????EJ'''r-   c                    ddl m} t          | |          r| j        r| j        D ]} ||           d S  ||            d S )Nr   r}   )r   r~   r   temp_groupingr#   )r?   fnr~   _snodes       r+   _temp_group_visit_leavesr      sn    >>>>>>%-.. 53F l 	 	FBvJJJJ	 	 	5					r-   Fr_   c                    d}| j         D ]G}|r|dz  }||                                z  }|r%|t          |                                            z  }H|S )Nrd   r*   )r#   get_namer6   get_buffer_names)r?   	with_bufsretns       r+   _group_namer      sj    
C\ 7 7 	3JCqzz|| 	7d511334466CJr-   c                :    t          | t                    o| j        S rg   )r   r   is_fake)ds    r+   _is_fake_depr      s    a!!/ai/r-   gnsc                @    d                     d | D                       S )N~c                6    g | ]}|                                 S r(   r   )r)   gns     r+   r,   z _group_names.<locals>.<listcomp>   s     111rR[[]]111r-   )join)r   s    r+   _group_namesr      s#    8811S111222r-   c                    t          | |          }t          | ||          \  }}}}t          t          | |                    }d|d<   |||||fS )z*Initialize memory tracking data structures)r   r   N)r   r   dictzip)	r#   graph_inputsgraph_outputsname_to_freeable_input_bufpeak_memorysnodes_curr_memorysnodes_allocfreebuf_to_snode_last_use_curr_memorys	            r+   _initialize_memory_trackingr      su    !7!M!M&.	
 	
 MK#%57L
 F$67788LL" r-   tuple[dict[BaseSchedulerNode, Optional[BaseSchedulerNode]], dict[BaseSchedulerNode, Optional[BaseSchedulerNode]], BaseSchedulerNode]c                    i }i }t          |           D ]A\  }}|dk    r| |dz
           nd||<   |t          |           dz
  k     r| |dz            nd||<   B| d         }|||fS )z/Create double-linked list structure from snodesr   r   N)	enumerater<   )r#   _prev_nextrE   r?   _heads         r+   _initialize_double_linked_listr      s     EEf%% F F5()Ava!e}}4e()CKK!O(;(;va!e}}e1IE%r-   Dtuple[list[BaseSchedulerNode], dict[BaseSchedulerNode, ReorderInfo]]c                  /0123456789:;<=> d}| D ]}t          |          rd} n|s| i fS ddlm} t          |           }t	          t
          j        j                                                  }t	          t
          j        	                                          }t          | ||          \  }/>3}d | D             <i }	d:<fd}
d}t          |           \  210d;1fd}012fd}/45678fd}/3458>fd}t          j        }d}0}t          j        }d}1|         0|rn,t          |          r|||k    rn|dz  }t                      x}|	|<    |
| |1|         d                    x|_        |_        2|         }|}|8/|         d         7|t          |          r	d|_        n ||8          6 ||j        6d          }d |j        D             }|                                }d}|D ]0}|                    |                                d          x}r|} n1|d<d} ||          \  }} |rK|}t1          7/|         d                   7|xj        dz  c_        t5          6          |_        2|         }d| dt9          |                                           d|                                 d|                                g d t5          6           d!|  }!|!|_        nD>|         44j        4j        z
  5tA          t8                    }"3!                                D ]M\  }#=|#j"        j#        }$||$vrtI          =fd"6D                       s2|"=         %                    |#           N ||6|"          \  }%}&|%|k    rd#|% d$| |_        n|xj&        dz  c_&        |dz  } |||8            |
| |1|         d                    |_         ||6|"|&           |r5dd%l'm(}'  |'|6t5          6           |0d          |||/>d&|"          }|rn2|         }|1|         }1|         0|	::fd':D             9tS          9fd(9D                       }(tS          :fd):D                       }d*|( d+| d,;g d-})d. :!                                D             }*tT          j+        ,                    d/          rdd0l-m-}+ ; |+|*|)1          z  ;nE;d2z  ;;t]          |)          d3z   z  ;;d3/                    ta          t\          |*                    z  ; |0d          },t          |,          |k    sJ tc          |,||          \  }-}.}.}.;d4| z  ;;d5|- z  ;td          3                    ;           ti          d6d7 ;fd89           |,|	fS )=z
    Internal testing helper that also returns debug info.
    Returns:
        - reordered snodes list
        - dict {snode: ReorderInfo}
    FTr   r}   c                .    i | ]}|t          |          S r(   estimate_op_runtimer)   r?   s     r+   
<dictcomp>zJ_reorder_communication_preserving_peak_memory_internal.<locals>.<dictcomp>#  s0     0 0 0.3"5))0 0 0r-   collective_snoder"   remaining_snodesr$   rG   r[   c                    t          |           }d|D ]:}t          |          rt          |          r ndfd}t          ||           ;t	          d|z
            S )	N        r   r"   rG   r^   c                    |          z  d S rg   r(   )r   compute_timeruntimess    r+   accumulate_timezs_reorder_communication_preserving_peak_memory_internal.<locals>.exposed_communication_time.<locals>.accumulate_time7  s     00r-   r   )r   r"   rG   r^   )r   r   r   r   max)r   r   	comm_timer?   r   r   r   s        @r+   exposed_communication_timezZ_reorder_communication_preserving_peak_memory_internal.<locals>.exposed_communication_time)  s     ((899	% 	= 	=E"5)) U##  1 1 1 1 1 1 1 %UO<<<<1i,.///r-   headOptional[BaseSchedulerNode]tailc                `    g }| }	 ||                     |           ||k    rn	|         }'|S rg   appendr   r   r   r   r   s       r+   _group_nodeszL_reorder_communication_preserving_peak_memory_internal.<locals>._group_nodesB  I     	}

1DyyaA	 
r-   c                    |          }|r||<   ||<   |         }|r| |<   || <   || <   | |<   | k    r|d S d S rg   r(   )	candidate
group_head
group_tailcandidate_prevgroup_tail_nextr   r   r   s        r+    _perform_double_linked_list_swapz`_reorder_communication_preserving_peak_memory_internal.<locals>._perform_double_linked_list_swapO  s     y) 	/$.E.!*j  
+ 	/%.E/"*i &i%j IEEE r-   c                   i }d}|s.t          z
           d         z
  j        z             }||fS  }D ]T}|         d         |z   }|||<   t          ||          }|                    |d           }||D ]}	||	j        j        z  }U         d         |z   j        z   }
|
|| <   t          ||
          }||fS Nr   r   )r   
size_allocget
mpi_buffer	size_free)r   group_ns/group_n_to_bufs_after_swap_dealloc_by_candidate_post_alloc_updatepotential_peakmem_after_reorder_deltar   gn_post_alloc_membufsbufcandidate_mem_post_allocr   candidate_allocfreecandidate_delta_memr   group_peak_memoryr   s              r+    _calculate_potential_peak_memoryz`_reorder_communication_preserving_peak_memory_internal.<locals>._calculate_potential_peak_memoryi  s<   
 <>> 	6 !$77Z(+%&%01 N "#555 )<'; 		H 		HB ,R 0 36M M%6r" 1BCCNBFFr4PPD H HC+s~/GG++ $Q'%&!,- 	!
 )A9%^-EFF111r-   c                   |sO|D ]#}|         }|d         z
  |d         z
  f|<   $         d         j         z   }|j        z
  }||f| <   d S |                                D ]\  }}|D ]}	| |	<   d}
|D ][}||         }t          d ||         D                       }|
|z  }
|         xj        |z  c_        ||         j        z
  }||f|<   \||          }|          xj        |
z  c_        ||          j        z
  }||f| <   d S )Nr   r   c              3  .   K   | ]}|j         j        V  d S rg   r   r   r)   r   s     r+   r   zu_reorder_communication_preserving_peak_memory_internal.<locals>._update_memory_tracking_after_swap.<locals>.<genexpr>  s;       6 6 (6 6 6 6 6 6r-   )r   r   itemssum)r   r   r   r   r   cm_candidate_post_alloc_mem_candidate_post_free_memr   r   "size_free_to_move_to_candidate_sumr   _gn_post_alloc_memsize_free_to_move_to_candidategn_post_free_memcandidate_post_free_memr   r   r   r   r   r   s                   r+   "_update_memory_tracking_after_swapzb_reorder_communication_preserving_peak_memory_internal.<locals>._update_memory_tracking_after_swap  s    ? 	  !"%qE//qE//$R  
 Z(+.A.LL & *,?,II % *('L# F =BBDD	7 	7 
 7 7-6%c**7 34* 
	E 
	EA&8&;25 6 6J1M6 6 6 3 3* /2PP.Q))-KK))$69I!9L9V$V13CDLOO$6y$A!#--1SS--%(8(C(MM 	  &##
Yr-   Nr   zcollective orderingr   c                <    i | ]}t          |          |j        |S r(   r   namer)   r   s     r+   r   zJ_reorder_communication_preserving_peak_memory_internal.<locals>.<dictcomp>  s8       "#|TUFA  r-   r   tuple[bool, Optional[str]]c                J    t          |           rdS t          |           rdS dS )N)Fr   )Fr   TN)r   r   )r   s    r+   is_groupablezL_reorder_communication_preserving_peak_memory_internal.<locals>.is_groupable   s6     /y99 @#?#?-i88 ?#>#>)zr-   data dependency (dep_names:)
 candidate:z(outs:)dep on 
 non_group_reason:c              3  $   K   | ]
}|k    V  d S rg   r(   )r)   r   snode_last_uses     r+   r   zI_reorder_communication_preserving_peak_memory_internal.<locals>.<genexpr>:  s(      BBr^3BBBBBBr-   peak memory new:	 vs base:!_debug_iterative_memory_recomputerW   c                ,    i | ]}||         j         S r(   )ri   r)   r?   rV   s     r+   r   zJ_reorder_communication_preserving_peak_memory_internal.<locals>.<dictcomp>s  s#    PPPE5*U+7PPPr-   c                     g | ]
}|         S r(   r(   )r)   r?   ri   s     r+   r,   zJ_reorder_communication_preserving_peak_memory_internal.<locals>.<listcomp>t  s    IIIE[/IIIr-   c                *    g | ]}|         j         S r(   )rb   r  s     r+   r,   zJ_reorder_communication_preserving_peak_memory_internal.<locals>.<listcomp>u  s!    GGG5z%(.GGGr-   zAreorder_communication_preserving_peak_memory improved overlap by z
 ns after z reorders.
)zCollective nodezinitial exposedzfinal exposedri   limiting factorrb   rc   re   c           
         g | ]?\  }}t          |          |j        |j        |j        |j        |j        |j        |j        g@S r(   )node_summaryr\   r]   ri   r`   rb   rc   re   )r)   r?   	node_infos      r+   r,   zJ_reorder_communication_preserving_peak_memory_internal.<locals>.<listcomp>  sd        E9 %#!%O"		
  r-   tabulater  headers>Please `pip install tabulate` to nicely render overlap stats.

z
 peak_memory_before:z
 peak_memory_after:artifactc                     dddS )NrW   stringr   encodingr(   r(   r-   r+   <lambda>zH_reorder_communication_preserving_peak_memory_internal.<locals>.<lambda>  s    B 
 
 r-   c                      S rg   r(   )reorder_log_strs   r+   r  zH_reorder_communication_preserving_peak_memory_internal.<locals>.<lambda>  s    ? r-   metadata_fn
payload_fn)r   r"   r   r$   rG   r[   r   r   r   r   rG   r$   )r   r"   rG   r   )5r   r   r~   r<   r   r    graphr   keysget_output_namesr   r   r   (reorder_iterative_debug_limit_to_reorder(reorder_iterative_debug_memory_recomputerY   r\   r]   r`   	schedulerunmet_dependenciesget_outputsr   r   r   rc   r   re   r6   r   r   r   r   r   r   
succ_nodesr   r   rb   comms_debugr	  r   	importlibutil	find_specr  r_   r   mapr   overlap_loginfor
   )?r#   has_collectivesr?   r~   original_snodes_numr   r   r   r   statsr   total_movesr   r   r   r    debug_num_collectives_to_reordernum_processed_collectivescurr debug_iterative_memory_recomputeiterative_recompute_errorr3  r   r   group	data_depscandidate_outsdata_depor   r   is_groupable_resultgrouping_reasonmsgr   r   r,  r   r   r	  total_improvementr  rowsr  
new_snodesnew_peak_memoryr*   r   r   r   r   r   r   r   r   r   r   ri   rV   r  r   r  r   s?                                                  @@@@@@@@@@@@@@@@r+   rT   rT     s    O  u%% 	"OE	  rz>>>>>>f++$.qw/C/H/H/J/J$K$KL%/0H0H0J0J%K%KM 	$FL-HH"0 07=0 0 0H 35E0 0 0 0 0 0* K8@@E5%           4&2 &2 &2 &2 &2 &2 &2 &2 &2 &2P5
 5
 5
 5
 5
 5
 5
 5
 5
 5
p 	7 % &'D'-'V$ %
+
!$ 	t$$ [	./;)-MMM%*%!,.D5;8R8Rll5;559 9 D 4#5 dIJJ ,T 21 5'&y11 +@D(/;|J
/S/S,,N"&   ','?  	 "+!6!6!8!8'  A%MM!**,,===q #$ '	* 	* 	* 	* <H<	;R;R8'* %.
,/-|I/Fq/I- -) ),8,=,=)$))$4	 Ex E EDIYIYDZDZ E E,5,>,>,@,@E EIRIcIcIeIeHfE E&23&7&7E E 4CE E  03,3CI3N#'25H5RR $"  %% @ +0022" " "!$!:J 
22 BBBBcBBBBB ! C&fSkkkk5U5Us$S6 62 2 "K//Q>QQKQQ ( 

a

q 00J
SSS%?%?,,uT{D99& &" 32C&	   4  ONNNNN0Q0Q!$S))$UD112%#$(FG1 1- 1 !*-	W 'X T{ +
!B JPPPPZPPPKIIII[IIIJJGGGGJGGGHHK	,L] 	, 	,	, 	, 	, 	 	 	G  !+ 0 0 2 2  D ~
++ 5%%%%%%88
 
 
 	

 	M	
 	3w<<$..499Sd^^444eT**Jz??11111=.   OQ1 ====O@@@@O_%%%
 
 +***    ur-   rJ   rK   rL   c                B   i }i i i i ct          |           D ]\  }}|                                D ]}|||<   |                                D ]}||<   ||                                <   |                                }	t          j        |	<   d|	<   ||	<   d}
| D ]}|rit          |          rZ|
|                                <   |j        D ]5}|                                         }t          |         |
          |<   6|
dz  }
m|r&t          |          rd|                                <    G fdd          d | D             g t          t                    d | D                                             D ]V\  }}t          |          dk    rt          j         |                     |D ]}|                             |           Wg fdfdfd	}t                    rPt          j                  j        }|rt          |          r ||           n |           t                    P                                D ]%\  }}t          |          dk    sJ d
             &S )a  
    Schedule `snodes` for various comm optimization objectives.

    Args:
        snodes: the nodes to be scheduled.
        raise_comms: whether to greedily schedule collectives as early as possible
        sink_wait: whether to greedily schedule waits as late as possible
        reorder_compute_for_overlap: whether to reorder compute nodes to
            optimize for compute/communication overlapping.

    Returns:
        The new schedule order.

    Some notes on the synergy between different options:
        - `raise_comms` provides more overlapping oppurtunies for `reorder_compute_for_overlap`.
        - When both `raise_comms` and `sink_waits` is `True`, `raise_comms` is prioritized.
    r   r   c                  (    e Zd Zd fdZd ZdS )$_schedule_for_comm.<locals>.RunnablerG   r^   c                    || _         t          t          |                                                    }|                                         }|         |         |         f| _        d S rg   )r?   nextiterget_operation_namesr   score)rh   r?   r   
fused_namename_to_fused_nodescores_0scores_1scores_2s       r+   __init__z-_schedule_for_comm.<locals>.Runnable.__init__  sf    DJU668899::D+D1::<<J$$$DJJJr-   c                "    | j         |j         k     S rg   rP  )rh   others     r+   __lt__z+_schedule_for_comm.<locals>.Runnable.__lt__  s    :++r-   N)rG   r^   )rj   rk   rl   rV  rZ  )rR  rS  rT  rU  s   r+   RunnablerK    sQ        	 	 	 	 	 	 	 	 		, 	, 	, 	, 	,r-   r[  c                L    i | ]!}|t          d  |j        D                       "S )c              3  $   K   | ]}|j         V  d S rg   r   )r)   deps     r+   r   z0_schedule_for_comm.<locals>.<dictcomp>.<genexpr>  s$      GGs#(GGGGGGr-   )r   r*  r   s     r+   r   z&_schedule_for_comm.<locals>.<dictcomp>  sD     < < < 	zGGe.FGGGGG< < <r-   c                .    i | ]}|t          |          S r(   r   r   s     r+   r   z&_schedule_for_comm.<locals>.<dictcomp>  s#    KKK5U/66KKKr-   c                                        |            |                                 D ]_}|         D ]T} |                              |           t          |                    dk    rt	          j         |                      U`dS )zU
        Schedules `snode` and put all unblocked nodes onto the ready queue.
        r   N)r   r   remover<   heapqheappush)r?   buf_namer[  buffer_usersready	scheduled
unmet_depss     r+   schedulez$_schedule_for_comm.<locals>.schedule  s     	..00 	; 	;H%h/ ; ;5!((222z%())Q..N5((5//:::;	; 	;r-   c                 j    d D             } t          |           dk    rdS t          | d           S )zh
        Return the next node in the ready queue that's neither a collective or
        a wait.
        c                b    g | ],}t          |j                  t          |j                  *|-S r(   )r   r?   r   r   s     r+   r,   zI_schedule_for_comm.<locals>.get_overlapping_candidate.<locals>.<listcomp>%  sM     
 
 
&qw//
 9Fag8N8N

 
 
r-   r   Nc                    | j         S rg   rX  r   s    r+   r  zG_schedule_for_comm.<locals>.get_overlapping_candidate.<locals>.<lambda>,  s    QW r-   key)r<   min)
candidatesrg  s    r+   get_overlapping_candidatez5_schedule_for_comm.<locals>.get_overlapping_candidate   sP    

 

 
 


 z??a4:#4#45555r-   c                *   t          |           sJ  |            |          }|dk    rS             x}G                    |            |j                   ||j                 z  }|dk    r             x}Gt          j                   dS )z
        Schedules collective node `snode`, along with one or more compute nodes
        to overlap with it. The strategy is described in the comment of
        `reorder_compute_for_overlap`.
        r   N)r   rb  r?   rc  heapify)r?   collective_costr   rs  rg  rj  snode_to_costs      r+   schedule_collective_for_overlapz;_schedule_for_comm.<locals>.schedule_collective_for_overlap.  s     #5)))))'.a77999FLL###HY_%%%}Y_==O a77999F
 	er-   z;Detected unscheduled nodes. Nodes with unmet dependencies: )r   r   rO  r   sysmaxsizer   	ancestorsrq  r   r   r   r   r<   rc  rd  addheappopr?   )r#   rJ   rK   rL   buf_name_to_snodeidxr?   re  op_name	node_namecomm_idxancestoranc_fused_namedepsr_  rx  r[  rf  rs  rR  rg  rj  rh  rS  rT  rU  rw  ri  s                   @@@@@@@@@@@@r+   rN   rN     s   L #%r2 Hh'' " "
U..00 	0 	0H*/h''0022 	0 	0G*/w''/45>>++,NN$$	!k!H + + 	+.u55 	+)1HU^^%%&!O S S!3H!=!F!F!H!H+.x/G+R+R((MHH 	+M%00 	+)*HU^^%%&, , , , , , , , , , , , ,< << < <J
 E=H=T=TLKKFKKKM!'')) ) )tt99>>N5((5//222 	) 	)C!!%((((	) I	; 	; 	; 	; 	; 	; 	; 	; 	;6 6 6 6 6       & e** e$$* 	#6u#=#= 	++E2222HUOOO e**  "'')) 
 
t4yyA~~~V*VV ~~~ r-   nodesc           	        t           j                                        s| S d | D             }t          dt	          |                    D ]}t          t          ||                                                             }||dz
                                           D ]-}||                             t          ||d                     .| S )z
    Decide global ordering of comms, by just enforcing the ordering that's in the input graph
    (might not be the same ordering as the eager mode program).
    TODO: Come up with a better approach
    c                0    g | ]}t          |          |S r(   )r   )r)   r   s     r+   r,   z3decide_global_ordering_of_comms.<locals>.<listcomp>Z  s&    ===&9!&<&<=!===r-   r   Tmutating_bufr   )
r8   r1   is_availabler4   r<   rM  rN  r   add_fake_depr   )r  name_to_bufrR  
comm_nodesrE   r  r   s          r+   decide_global_ordering_of_commsr  O  s     ))++ ==U===J1c*oo&&  DA!?!?!A!ABBCCa!e$5577 	 	CqM&&,EEE   	
 Lr-   c                  V    e Zd ZU dZded<   dZded<   dZded<   dZded<   d	Zded
<   dS )SinkWaitInfor   ra   rc   rd   r_   re   rb   
moves_infor^   r`   N)	rj   rk   rl   rc   rn   re   rb   r  r`   r(   r-   r+   r  r  g  sc         GLENNNNJ!O!!!!!!r-   r  Etuple[list[BaseSchedulerNode], dict[BaseSchedulerNode, SinkWaitInfo]]c                  0123456789: ddl m} t          |           }|dk    r| i fS t          t          j        j                                                  }t          t          j                                                  }t          | ||          \  }0:}}t          |           \  321i }d02fd}	045678:fd	}
123fd
}045:fd}| d         }t                      }t          j        }t          j        }d}3|         _|rn[|t          |          |k    rnDt          |          r#||vr|                    |           t!                      x}||<   2|         }|}|7|}0|         d         8||rn |	7|          6 ||j        6d          }d |j        D             }|                                }d }|D ]0}|                    |                                d           x}r|} n1| t-          |          ot-          |          x}rd } ||          \  }}|rL|}t/          80|         d                   8|xj        dz  c_        t3          6          |_        2|         }|0|r.dt3          6           d|                                 |_        nd| dt9          |                                           d|                                 d|                                g d6 dd |D              d| |_        n;:|         44j        4j        z
  5tA          t8                    } |!                                D ]C\  }!}"|!j"        j#        }#|"|k    rd }$6D ]}%|%|#v r|%}$	|$(| |$         $                    |!           D |
|6|           \  }&}'}(|&|k    rd|& d| |_        n|xj%        dz  c_%        |xj&        d|                                 z  c_&         ||7|            ||6| |'|(           |r5dd l'm(})  |)|6t3          6           |	1d           |||0:d!|           }|rn2|         }|׉3|         }3|         _g d"}*d# |!                                D             }+d$9tR          j*        +                    d%          rdd&l,m,}, 9 |,|+|*'          z  9nE9d(z  99t[          |*          d)z   z  99d).                    t_          tZ          |+                    z  9t`          1                    9            |	1d           }-t          |-          |k    sJ te          |-||          \  }.}/}/}/9d*| z  99d+|. z  9tg          d,d- 9fd./           |-|fS )1Nr   r}   r   r   r   rG   r$   c                `    g }| }	 ||                     |           ||k    rn	|         }'|S rg   r   r   s       r+   r   z4_sink_waits_iterative_internal.<locals>._group_nodes  r   r-   c                            d                  j         z
  }i }i }d}|s t          z   |j         z             }|||fS |j         z   }||| <   |}t          d t          j                            |                                          D                       }| || <   |z   }	D ]T}
|
         d         |	z   }|||
<   t          ||          }d}|
|v r!||
         }|D ]}||j        j        z  }|||
<   |	|z  }	U|||fS )Nr   c              3  .   K   | ]}|j         j        V  d S rg   r   r   s     r+   r   z[_sink_waits_iterative_internal.<locals>._calculate_potential_peak_memory.<locals>.<genexpr>  s;       *
 *
 N$*
 *
 *
 *
 *
 *
r-   )	r   r   r   	itertoolschainfrom_iterabler7   r   r   )r   r   7group_n_to_bufs_after_swap_dealloc_instead_of_candidatepre_group_memr   _size_free_delta_updater   candidate_post_alloccandidate_size_free_to_move	delta_memr   gn_post_allocgn_size_free_to_addr   r   r   r   r   r   r   r   r   s                  r+   r   zH_sink_waits_iterative_internal.<locals>._calculate_potential_peak_memory  s    $Q'*::*F*QQ 	 <>@BF 	O !$77 3 >> N "#57NNN,/B/MM(<9%-&) *
 *
 44GNNPP *
 *
 *
 '
 '
# /J-I	*'*EE	 
	- 
	-B(,Q/);M%2r" ??N"#LLLNrR D DC'3>+CC''.A'+,,II13JJJr-   c                    |         }|r| |<   || <   |          }|r||<   ||<   | |<   || <   |k    r| d S d S rg   r(   )r   r   r   group_head_prevcandidate_nextr   r   r   s        r+   r   zH_sink_waits_iterative_internal.<locals>._perform_double_linked_list_swap  s      
+ 	/%.E/"*i y) 	/$.E.!*j &j%iEEE r-   c                X   |d         }|         d         |         j         z
  }|sA|j         z   }||j        z
  f| <   |D ]#}|         }	|	d         z   |	d         z   f|<   $d S | g|D ];}
||
         }|
         xj        ||
         z  c_        |||
         j        z
  f|
<   <d S r   )r   r   )r   r   r  r   r  r   r  r  r   r   r   
post_allocr   r   r   r   s               r+   r   zJ_sink_waits_iterative_internal.<locals>._update_memory_tracking_after_swap  s    V
$Q'*::*F*QQ 	 G 	#03F3Q#Q $$':'DD'L#   !"%qE//qE//$R   F"c" 	 	A+A.JQ))-DQ-GG))-a0::LOO	 	r-   rZ   FTr   c                <    i | ]}t          |          |j        |S r(   r   r   s     r+   r   z2_sink_waits_iterative_internal.<locals>.<dictcomp>  s8       '??FA  r-   c                    t          |           rdd|                                  fS t          |           rdd|                                  fS dS )NFzcandidate contains collective zcandidate contains gemm_like r   )r   r   r   r?   s    r+   r   z4_sink_waits_iterative_internal.<locals>.is_groupable0  so    .u55  % SAQAQ S S$  .e44  % R@P@P R R$   *zr-   r   zcollective ordering z with candidate:r   r   r  z(os:r  z
 outs:c                6    g | ]}|                                 S r(   r   r)   rA  s     r+   r,   z2_sink_waits_iterative_internal.<locals>.<listcomp>S  s     'I'I'I

'I'I'Ir-   r  r  r  +r  sink_waits_iterative)z	Wait noderc   re   rb   r  r  c                p    g | ]3\  }}t          |          |j        |j        |j        |j        |j        g4S r(   )r  rc   re   rb   r  r`   )r)   r?   r3  s      r+   r,   z2_sink_waits_iterative_internal.<locals>.<listcomp>  sV     
 
 
 E4 LJO 	

 
 
r-   rd   r  r  r  r  r  z*
 sink_waits_iterative peak_memory_before:z)
 sink_waits_iterative peak_memory_after:r  c                     dddS )Nsink_waits_iterative_infor  r  r(   r(   r-   r+   r  z0_sink_waits_iterative_internal.<locals>.<lambda>  s    / 
 
 r-   c                      S rg   r(   )log_strs   r+   r  z0_sink_waits_iterative_internal.<locals>.<lambda>  s    7 r-   r   r#  )4r   r~   r<   r   r    r$  r   r%  r&  r   r   r   r(  (sink_waits_iterative_debug_limit_to_sinkr   r|  r  r)  r*  r+  r   r   r   r   rc   r   re   r`   r6   r   r   r   r   r   r   r,  r   rb   r  r-  r	  r.  r/  r0  r  r_   r   r1  r2  r3  r   r
   );r#   r~   r5  r   r   r   r   r   r6  r   r   r   r   r:  processed_waitsr;  debug_num_sink_waits_to_reorderr<  r3  r   
wait_snoder   r=  r>  
group_outsr@  rA  r   both_contain_commsr   is_grp
grp_reasonr  r   r  r,  last_succ_gnr   r   r   r  r	  r  rF  r  rG  rH  r*   r   r   r   r   r   r   r   r   r   r  r   s;                                                   @@@@@@@@@@@r+   _sink_waits_iterative_internalr  p  sV    ?>>>>>f++arz$.qw/C/H/H/J/J$K$KL%/0H0H0J0J%K%KM 	$FL-HH" 9@@E5%35E     )K )K )K )K )K )K )K )K )K )K )KV      *       B ":D llO'-'V$7 $ !&
+
!$ 	+7O$$(GGG _	.4#>#>%%%!-/D5;dIJJJ ,T 21 5', /;|J
/S/S,,("&   &9  	 #..00
#  A%MM!**,,===q #$
 '+E22U7J97U7U& (* * * *6i)@)@&FJ %.
,/-|I/Fq/I- -) ),8,=,=)$))$4	 "*0B*F<3D3D F F/8/A/A/C/CF F , @x @ @DIYIYDZDZ @ @,5,>,>,@,@@ @GPGaGaGcGcFd@ @&)@ @ (J'Ij'I'I'I@ @ 4>	@ @ , 3CI3N#'25H5RR $  %% H +0022" " "!$!:J%22 #'L! . .+++-L#+  L$fSkkkk 54!O  L 24K "K//Q>QQKQQ ( 

a

#=y'9'9';';#=#==00J
SSS22K&+   4 NNNNNN0Q0Q!$S))$UD112%#$(.O1 1- 1 !*-	o 'p T{S +
!V  G
 
 !;;==
 
 
D G~
++ 
-%%%%%%88
 
 
 	

 	TT3w<<$&&499Sd^^,,,WeT**Jz??11111=.   OQ1 J[JJJGMOMMMG
 
 #???    ur-   c                ,    t          |           d         S )Nr   )r  rO   s    r+   r  r    s     *&11!44r-   r[   c                    t           j        dk    r|                                 }n/t          t           j                  sJ t          j        |           }|S )z:
    Returns estimated op runtime in nanoseconds (ns)
    ry   )r   r   r/   callable)r?   runtimes     r+   r   r     sQ     !Y..--//233333,U33Nr-   c           	     <   |                                  }t          |          dk    r(d}t          | j        t          j        t          j        f          r`dd |                                 D              }dd | j        D              }d| 	                                 d| j        j
         d	| d
| d	}d |                                  D             }d                    d |D                       }	 | j                                        }n# t          $ r d}Y nw xY w| j        j        j         | | d| d|                                 ddS g }|D ]$}	|                    t%          |	                     %| j        j         dd                    |           S )Nr   rd   zouts:c                6    g | ]}|                                 S r(   r   r  s     r+   r,   z node_summary.<locals>.<listcomp>  s     JJJ

JJJr-   zins:c                    g | ]	}|j         
S r(   r^  r   s     r+   r,   z node_summary.<locals>.<listcomp>  s    GGGafGGGr-    z (z)
 z
 ()c                @    g | ]}|j                                         S r(   )rp   get_output_spec)r)   childs     r+   r,   z node_summary.<locals>.<listcomp>  s&    OOOE5:--//OOOr-   ,c                l    g | ]1}t          |t          j                  rd |j         d|j         dnd2S )z (size=z	, stride=r  rd   )r   r   Layoutsizestride)r)   layouts     r+   r,   z node_summary.<locals>.<listcomp>  s[         fbi00@&+@@@@@@  r-   z.0fz ns): z, )	get_nodesr<   r   rp   r   ExternKernelOut_CollectiveKernelr+  r*  r   rt   r   maybe_get_nameAttributeError	__class__rj   r/   r   r  )
r?   r#   detailouts_strins_strlayoutsout_tensor_infor  	summarieschild_snodes
             r+   r  r    s   __F
6{{aej2#5r7K"LMM 	iLJJe6G6G6I6IJJJLLHIGGe.FGGGIIGh))hhUZ-JhhPXhh^ehhhFOOU__=N=NOOO((  &	  
 
	
1133II 	 	 	III	 *&/  A  A  A  AI  A  AY^YtYtYvYv  A  A  A  A  	A I 4 4k223333o&@@$))I*>*>@@@s   4D DDc                   d}d }d }t          |           D ]\  }}|kt          |          r|t          |          z  }|j        }n't	          |j                  rn|t          |          z  } ||t          |                      st          |          r4|t          |          z  }|j        } ||t          |                      t	          |j                  r ||t          |                      d } ||dt          |                      t                              d|dz  dz              d S )Nr   c                F    t                               | dd|            d S )Nz>6r  )r2  debug)steprD  s     r+   step_logz#visualize_overlap.<locals>.step_log  s-    T-----.....r-   z| zEst. runtime (ms): i  )r   r   r   rp   r   r  r2  r  )ordertotal_est_runtimecur_comm_noder  r  r?   s         r+   visualize_overlapr    s     #M/ / / !'' ; ;e "5)) 	@!%8%?%??! %
$$ @ !%8%?%??!HTl51134444"5)) ;!%8%?%??! %
,u"5"578888$$ ;,u"5"57888 $9L$7$799::::?/$6=??    r-   c                   | }t          t          j        j                                                  }t          t          j                                                  }t          j        D ]}t          |t                    r$|t                      v rt                      |         }t          |          sJ d| d            t          | t          | |          |          \  }}t          j                                        dk    ret"                              d| d|d           	 t'          |           n3# t(          $ r&}t"                              d|           Y d }~nd }~ww xY wt+          j                    } ||          }t+          j                    |z
  }	t          j                                        dk    ret"                              d	| d
|	 d           	 t'          |           n3# t(          $ r&}t"                              d|           Y d }~nd }~ww xY wt          | t          | |          |          \  }}t-          d|           |S )Nz3Invalid reorder_compute_and_comm_for_overlap pass: z is not callabler   z.==== Visualize overlap before reordering pass z, peak_memory=z ====rd   )exc_infoz-==== Visualize overlap after reordering pass z	 (ran in z	 sec)====zfinal peak_memory=)r   r    r$  r   r%  r&  r   'reorder_for_compute_comm_overlap_passesr   r_   globalsr  r   r   r8   r1   get_rankr2  r  r  	Exceptiontimeprint)
r#   r  r   r   pr   r*   et0ts
             r+   $reorder_compute_and_comm_for_overlapr  2  s    E$.qw/C/H/H/J/J$K$KL%/0H0H0J0J%K%KM; ' 'a 	!wyy..		!A{{ 	
 	
U!UUU	
 	
{ .*6<@@-
 
Q %%''1,,YYYkYYY  2!%(((( 2 2 2!!"q!111111112Y[[%IKK"%%''1,,XXXAXXX  2!%(((( 2 2 2!!"q!111111112-*6<@@-
 
Q 	%{%%&&&&Ls0   )D99
E)E$$E)$G44
H$>HH$r$  torch.fx.Graphc           
     *   t          | j                  t          t                     t          t                     t                    D ]\  }}|j        dk    r|j        t          j        j        j	        j
        k    r|j        d         j        dk    sJ d| d|j        d          d            |j        d         }|j        d         }|dk    r|                             |           |                             |           fd}t          t                     }t                    D ]\  }}|j        dk    ru|j        t          j        j        j        j
        k    rQ|}|j        d         j        dk    sJ d	 d
|  d             |          r|                             |           d }d D ]y}|j        dk    rlt          |j        t          j        j                  rH|j        j        j        r7 ||          s, ||                                          rJ d| d            z|                                D ]\  }	t          |	          D ] \  }
}|         }|j        d         u sJ |j        \  }|dz   }|
t-          |	          dz
  k     r|	|
dz            nt-                    dz
  }||         }t/          fd|D                       rJ d d| d|  d            |D ]b}|j        dk    rU|j        v rL|j        t          j        j        j	        j
        k    r(t1          fd|j        D                       }||_        c|                                D ]7\  }	t          |	          D ]"\  }
}|         }|                     |           #8D ]U}|j        dk    rH|j        t          j        j        j	        j
        k    r$|j        d         |v r|                     |           VdS )a  
    This FX graph pass replaces uses of FSDP2 unsharded params with their corresponding
    graph intermediates that were fsdp.copy_ into the unsharded params in the original graph.

    NOTE: Can only apply this pass to any of the FSDP2 unsharded params that have this pattern
    (or repetition of): `resize_(full) -> copy_ -> resize_(0)`. Because of this, for partial-graph case
    where `resize_(full) -> copy_` is in one graph and `resize_(0)` is in another graph, we can't
    remove these resize and copy ops and thus we will have worse performance there.

    In other words, "do we try to remove all the resize_(full) -> copy_ -> resize_(0) nodes for this unsharded param"
    is actually a per-unsharded-param decision, since for each unsharded param, we look at its resize sequence pattern
    (in `check_resize_pattern()`) to determine if its set of resize and copy nodes can be removed.
    call_functionr   placeholderz1Resize can only operate on graph inputs, but got z# which is resizing non-graph-input r  r   c                                        | g           }                     | g           }t          |          t          |          k    s@t                              d|  dt          |           dt          |           d           dS t	          ||          D ]D\  }}||k    r9t                              d|  d|          d| d	|          d| d
            dS EdS )NzH
Unequal number of resize-to-full and resize-to-0 nodes for graph input z:
z vs. zK.
Skipping `remove_fsdp2_unsharded_param_graph_input_usage` FX graph pass.
Fz
For graph input z: resize-to-full node z
 at index z 
happens after resize-to-0 node zd.
Skipping `remove_fsdp2_unsharded_param_graph_input_usage` FX graph pass for that unsharded param.
T)r   r<   logwarningr   )graph_inputresized_to_full_idxesresized_to_0_idxesresize_to_full_idxresize_to_0_idx&graph_input_to_resized_to_0_node_idxes)graph_input_to_resized_to_full_node_idxes	node_lists        r+   check_resize_patternzLremove_fsdp2_unsharded_param_graph_input_usage.<locals>.check_resize_pattern}  sm    !J M M!
 !
 DGGUWXX())S1C-D-DDDKKHS  "%&8"9"9     5 47!#54
 4
 	 	/ "_44 4=>P4Q ]o  )/ : FU     uu 5 tr-   z\
Assumed all FSDP2 `unsharded_param`s to be graph input, but it's not true!
Offending node: z	. Graph: c                    | j         t          j        j        j        j        k    p#| j         t          j        j        j        j        k    S rg   )targetr8   rv   fsdpcopy_ry   inductorresize_storage_bytes_)rp   s    r+   is_allowed_mutationzKremove_fsdp2_unsharded_param_graph_input_usage.<locals>.is_allowed_mutation  s6    K59>/77 O{ei0FNN	
r-   c                4    t           j        t          j        j                  r(d t           j        j        j                  D             ng }t           fd|D                       }t          d |D                       }t          ||z            dk    S )Nc                >    g | ]\  }}|j         |j         j        |S rg   )
alias_infois_write)r)   rE   r   s      r+   r,   zyremove_fsdp2_unsharded_param_graph_input_usage.<locals>.is_node_mutating_unsharded_param_or_its_alias.<locals>.<listcomp>  s8       Aq<+0E+ +++r-   c                ~    g | ]9}t          j        |         j        d                                                    :S val)r   argsmetauntyped_storage)r)   rE   rp   s     r+   r,   zyremove_fsdp2_unsharded_param_graph_input_usage.<locals>.is_node_mutating_unsharded_param_or_its_alias.<locals>.<listcomp>  sK        ty|07GGIIJJ  r-   c                f    g | ].}t          |j        d                                                    /S r  )r   r  r  )r)   unsharded_params     r+   r,   zyremove_fsdp2_unsharded_param_graph_input_usage.<locals>.is_node_mutating_unsharded_param_or_its_alias.<locals>.<listcomp>  sD       # 3E:JJLLMM  r-   r   )
r   r  r8   _ops
OpOverloadr   _schema	argumentsr   r<   )rp   unsharded_paramsmutated_arg_idxesmutated_node_arg_storagesstorages_of_unsharded_paramss   `    r+   -is_node_mutating_unsharded_param_or_its_aliaszeremove_fsdp2_unsharded_param_graph_input_usage.<locals>.is_node_mutating_unsharded_param_or_its_alias  s     $+uz'<==  %dk&9&CDD     	 %/   *  %
 %
! (2 '7  (
 (
$ ,/KKLLqPPr-   zdUser mutation on FSDP2 unsharded param is not allowed when Traceable FSDP2 is used. Violating node: c              3  2   K   | ]} |g          V  d S rg   r(   )r)   rp   r  r  s     r+   r   zAremove_fsdp2_unsharded_param_graph_input_usage.<locals>.<genexpr>  sG         >=d_DUVV     r-   z(Assumed no ops mutating unsharded param z in subgraph z, but it's not true!
Graph: c              3  (   K   | ]}|u rn|V  d S rg   r(   )r)   argreplacementr  s     r+   r   zAremove_fsdp2_unsharded_param_graph_input_usage.<locals>.<genexpr>  sE       % % (+o'='=3% % % % % %r-   N)r6   r  r   r   opr  r8   rv   r  r	  ry   r  r   r  r  r   r  r  r  
is_mutabler%  r   r<   r   tuple
erase_node)r$  r  rp   r  new_sizer  'unsharded_param_to_fsdp_copy_node_idxesfsdp_copy_noder
  fsdp_copy_node_idxesrE   fsdp_copy_node_idxr*   subgraph_start_idxsubgraph_end_idxsubgraph_nodesnew_argsr   r  r  r  r#  r  s                    @@@@@@r+   .remove_fsdp2_unsharded_param_graph_input_usager1  [  s    U[!!I 1<D0A0A--8->->*y)) P P	TG&&uy1GOOO9Q<?m333 6266 6[_[def[g6 6 6333 )A,Ky|H!||9+FMMcRRRR6{CJJ3OOO" " " " " " "J /:$.?.?+y)) 	U 	U	T7o%%$+9M9U*U*U!N"ilO"%666 9 9 9+09 9 9666 $#O44 U7HOOPSTTT
 
 
Q Q Q4   G&&4;
(=>> '#. ' ('-- '
 ED=BBDD   ei    , 
1	6	6	8	8") ") 	%./C%D%D 	) 	)!A!&'9:N!&q)_<<<<+0NA{!3a!7 s/001444 %QU++^^a' 
 ''9:J'JKN     *      )8 GU      ' 
) 
)G..'4944uy'9'O'WWW$ % % % % %#'9% % %    H !)DI
))	)H 
1	6	6	8	8- - 	%./C%D%D 	- 	-!A!&'9:N^,,,,	-
  # #G&&uy1GOOO	! GGGT"""# #r-   r^   c                  	 	 dd l 		j                                        sJ 	j        j        j        r	j        j        j        sJ n# t          t          t          f$ r Y d S w xY wddl
m}m}m}m}m} 	 	fd} |            } | |	j        j        j        j         |t"          j         |	j        j        j        j         |d           |d           |d           |d           |d	                     |d
                     |d           |d                    |d           d	fd            } ||            |                    |            d S )Nr   r   )CallFunction
KeywordArgMatchPatternMatcherPassregister_graph_patternc                    t          | j                  }|D ]e}|j        t          j        k    rN|j        d         j        j        j        j        j	        u r&|j        d         dk    r| 
                    |           fd S r   )r6   r  r  operatorgetitemr  rv   r  all_gather_copy_inry   r'  )gr  r   r8   s      r+   remove_unused_getitemz8reinplace_fsdp_all_gather.<locals>.remove_unused_getitemD  sv    MM	 	  	 AH,,,F1I$	(I(QQQF1INNQ	  	 r-   all_gather_inputsall_gather_outputinp_split_sizesall_gather_input_numelrankitem_idx
group_size
group_namec                $    | j         d         dk    S )NrC  r   )kwargs)matchs    r+   r  z+reinplace_fsdp_all_gather.<locals>.<lambda>d  s    %,z":a"? r-   )	pass_dictextra_checkrH  r5  c                    fd}|                      ||d         |d         |d         |d         |d         |d         |d         g           d S )	Nc                     | d d         }| d         }| d         } j         j        j        j        | }|d         }|d         }j         j        j                            ||||          }|S )NrZ   r   r   )out)rv   r  r;  ry   _c10d_functionalall_gather_into_tensor_out)	r  copy_in_argsrD  rE  r;  r:  	getitem_1all_gather_into_tensorr8   s	           r+   replzEreinplace_fsdp_all_gather.<locals>.reinplace_all_gather.<locals>.replg  s      9LbJbJ!J!B!J" )+G*1-I	*EMMZ N   #
 *)r-   r>  r?  r@  rA  rB  rD  rE  )replace_by_example)rH  r  rG  rT  r8   s       r+   reinplace_all_gatherz7reinplace_fsdp_all_gather.<locals>.reinplace_all_gatherQ  s    ,	* 	* 	* 	* 	*$ 	  *+*+()/0v|$|$	
 	
 	
 	
 	
r-   )rH  r5  )5torch.distributed.fsdp._fully_shard._fsdp_collectivesr1   r  rv   rO  rS  rP  ImportErrorr  AssertionErrorpattern_matcherr3  r4  r5  r6  r7  ry   r9  r:  r  r;  apply)
r$  r3  r4  r5  r6  r7  r=  
graph_passrV  r8   s
            @r+   reinplace_fsdp_all_gatherr]    s4   
DDDD --///// I&=	
	*E	
 	
F	
 8                 	  	  	  	  	  $#%%JI&=EL IN5=J233J233J011J788Jv&&  
:&&  J|$$J|$$	
 	
" ??'  *
 
 
 
 
+ *
@ %   Us   AA A"!A"c                    t          | t          j        j        j        t          j        j        j        f          rJ t          |                                 dd                    S )N   )r   r8   	_inductorr)  FusedSchedulerNoder~   ra   r   r  s    r+   
get_op_idxrb    s_    O%8O%:	
     u~~#$$$r-   1list[torch._inductor.scheduler.BaseSchedulerNode]r  4dict[str, torch._inductor.scheduler.SchedulerBuffer]rR  dict[str, BaseSchedulerNode]c           	     
   ! ddl m  g }t          t                               }d}d}i }i }i ! !fd}	| D ]}
t	          |
j        t          j        j        j	        j
                  rt          fd|
j        D                       rd}|
}t                      }t          |||           t          t          j        j        j	        j
        t          j        j        j        j
        t          j        j        j        j
        g          t#          ||| fd	           t%          |d
           }t'          |          }d}t)          t'          |                    D ]G}||         }t+          |j        t          j        j        j        j
                  r|dz  }|dk    r|} nH|d |         }d }t)          t'          |          dz
            D ]1}t-          ||dz            j        t.          j                  r|dz   } n2|J  |	|d |                   } |	||d                    }|||<   t+          |
j        t          j        j        j        j
                  rd}|
}t                      }t#          |||           t%          |d           }d }t)          t'          |          dz
            D ]1}t-          ||dz            j        t.          j                  r|dz   } n2|J  |	|d |                   } |	||d                    }|||<   t'          !          dk    sJ |rt'          |          dk    sJ |rt'          |          dk    sJ | D ]a}
|
                                !v r!|
                                         }
|
|v r7|                    |
           |                    |
           bd }|                                D ]\  }}||t=          t?          |                                                     }|!                                D ]9}|"                    tG          |                                |d                     :|}d }|                                D ]\  }}||t=          t?          |                                                     }|!                                D ]9}|"                    tG          |                                |d                     :|}|S )Nr   )r)  Fc                    j                             |           }| D ]}||                                <   ||                                <   |S rg   )r~   creater   )snodes_to_group
group_noder?   r)  snode_name_to_final_snodes      r+   _create_group_nodez:enforce_comm_ordering_for_fsdp.<locals>._create_group_node  s_    3::?KK
$ 	E 	EE:D%enn&6&677;E!*"5"5"7"78r-   )r$  c              3     K   | ]8}t          |         j        t          j        j        j        j                  V  9d S rg   )r   rp   r8   rv   r  r;  ry   )r)   r   rR  s     r+   r   z1enforce_comm_ordering_for_fsdp.<locals>.<genexpr>  sZ       
 
  "1%*EIN,M,U 
 
 
 
 
 
r-   Tc                v    t          | j                  p"t          | j                  o| j        j        v  S rg   )r   NopKernelSchedulerNodeExternKernelSchedulerNoderp   op_overload)r   allowed_opsr)  s    r+   r  z0enforce_comm_ordering_for_fsdp.<locals>.<lambda>  sA    q)"BCC "1i&IJJ >F.+=	' r-   )criteria_cbc                     t          |           S rg   rb  rn  s    r+   r  z0enforce_comm_ordering_for_fsdp.<locals>.<lambda>      JqMM r-   ro  r   c                     t          |           S rg   ru  rn  s    r+   r  z0enforce_comm_ordering_for_fsdp.<locals>.<lambda>  rv  r-   r  )$rd   r)  r   r   r   rp   r8   rv   rO  rP  ry   r   r{  r   wait_tensorr  split_with_sizes_copyr   sortedr<   r4   r   r   r   _WaitKernel	chunk_catr   r   r|  r   rM  rN  r   r+  r  r   )"r#   r  rR  	new_orderrh  	ag_exists	rs_exists$ag_grouped_node_to_wait_grouped_node$rs_grouped_node_to_wait_grouped_noderl  r?   ag_snodeag_related_snode_setag_related_snodesend_idx_of_current_ag_blockcopy_out_countrE   	cur_snodewait_node_idxag_group_nodeag_wait_group_noders_snoders_related_snode_setrs_related_snodesrs_group_noders_wait_group_nodeprev_ag_waitwait_group_noder  rA  prev_rs_waitrr  r)  rk  s"     `                            @@@r+   enforce_comm_ordering_for_fsdpr    sZ   
 )+I3!!III+-(+-( "       nU nUJ595PX
 
 
 l	U 
 
 
 
 _	
 
 
 
 
l	U IHLVLL  ($"	   %I.IQI.:BIN8@ K )$"        !'$*A*A! ! ! +..?*@*@'N301122  -a0	!NEIN$H$P  ( #a'N!A%%23/E & !22N3N2N O !M3011A566  /A6;R^LL $%EME !,,,../@-/PQQM "4!34Emnn4U!V!VBT0?? EJ	(@(HII 	UIH MWLL ($"	   !'$*A*A! ! !
 !M3011A566  /A6;R^LL $%EME !,,,../@-/PQQM "4!34Emnn4U!V!VBT0?())A---- =7881<<<< =7881<<<<   >>888-enn.>.>?EIe L*N*T*T*V*V ' '&#]%C%C%E%E F FGGL!--//  **AJJLL|TRRR    ' L*N*T*T*V*V ' '&#]%C%C%E%E F FGGL!--//  **AJJLL|TRRR    'r-   )r#   r$   )r#   r$   rG   r$   )rp   rq   rG   rr   )r?   r"   rG   rr   )F)rG   r_   )r   r$   rG   r_   )r#   r$   rG   r   )r#   r$   rG   r   )
r#   r$   rJ   rr   rK   rr   rL   rr   rG   r$   )r  r$   rG   r$   )r#   r$   rG   r  )r?   r"   rG   r[   )r$  r  )r$  r  rG   r^   )r#   rc  r  rd  rR  re  rG   rc  )V
__future__r   rc  r.  r  loggingr9  ry  r  collectionsr   dataclassesr   typingr   r   r   r	   r8   torch._loggingr
    torch.multiprocessing.reductionsr   torch.utils._ordered_setr   rd   r   r   dependenciesr   r   r   r)  r   memoryr   r   r   r   r   utilsr   r   r   r   r   r   r   virtualizedr    	getLoggerrj   r  _logginggetArtifactLoggerr2  r   r"   rF   rK   rJ   rR   rW   rY   r{   r   r   r   r   r   r   r   rT   rN   r  r  r  r  r   r  r  r  r1  r]  rb  r  r(   r-   r+   <module>r     s   # " " " " "            



  # # # # # # ! ! ! ! ! ! 6 6 6 6 6 6 6 6 6 6 6 6  + + + + + + ; ; ; ; ; ; / / / / / /         ! ! ! ! ! !  +%%%%%%%%******                                     g!!n..xCC <;;;;;;M M M M,         &# # # #L 9 9 9 9 9 9 9 9"   "( ( ( (      0 0 03 3 3 3  &   "m m m m`W W W Wt   0 " " " " " " " "h h h hV5 5 5 5	 	 	 	A A A># # #L& & & &RA# A# A# A#Hh h h hV% % %n n n n n nr-   