
    `i;(                        d dl Z d dlZd dlZd dlmZ d dlmZ d dlZd dlZddl	m
Z
 ddlmZmZmZ ddlmZ  ej        e          Z G d d	e          Z G d
 de          Ze j        defd            Zde
j        defdZde
j        defdZde
j        defdZ G d de          Z G d de          Z G d de          ZdgdggZ dgdggdgdggdgdgggZ!g dg dg dgZ"dee#         fd Z$de
j        de#fd!Z%dS )"    N)IntEnum)Optional   )ir)get_dtype_sizesnode_args_kwargssympy_product)Vc                       e Zd ZdZdZdZdZdS )	NCCL_COLLr   r         N)__name__
__module____qualname__
ALL_REDUCE
ALL_GATHERREDUCE_SCATTER
ALL_TO_ALL     q/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torch/_inductor/comm_analysis.pyr   r      s"        JJNJJJr   r   c                       e Zd ZdZdZdZdS )NVIDIA_GPU_TYPEr   r   r   N)r   r   r   VOLTAAMPEREHOPPERr   r   r   r   r      s        EFFFFr   r   returnc                      t           j        j                            t           j        j        j                  pd} d| v rt
          j        S d| v rt
          j        S d| v rt
          j        S t
          j        S )N V100A100H100)	torchutilscollect_envget_gpu_inforunr   r   r   r   )gpu_infos    r   get_gpu_typer*       so    {&33EK4K4OPPVTVH$$	8		%%	8		%% %%r   nodec                    t          | t          j                  st          d|            | j        }|J d|v rt
          j        S d|v rt
          j        S d|v rt
          j        S d|v rt
          j	        S t          d|           )Nz!node is not a collective kernel: 
all_reduce
all_gatherreduce_scatterz-torch.ops._dtensor.shard_dim_alltoall.defaultzUnsupported collective kernel: )

isinstancer   _CollectiveKernel
ValueErrorpython_kernel_namer   r   r   r   r   )r+   kernel_names     r   get_collective_typer5   .   s    dB011 ECTCCDDD)K"""{""##		$	$##	[	(	(''	8K	G	G##H;HHIIIr   c                 .   d}| j         D ]}t          |j        j                  }t	          |t
          j                  rt          |          }n&t          j	        j
                            |d          }||t          |j        j                  z  z  }|S )Nr   )fallback)inputsr	   layoutsizer0   sympyIntegerintr
   graphsizevars	size_hintr   dtype)r+   sz_bytesinpnumels       r   get_collective_input_size_bytesrE   @   s    H{ = =cjo..eU]++ 	BJJEEG$..uq.AAEEN3:+;<<<<Or   c                     t          | t          j                  r6t          | t          j                  sddlm}  || j        d                   S t          d|            )Nr   )_get_group_size_by_namezUnsupported collective type: )r0   r   r1   _WaitKernel"torch.distributed.distributed_c10drG   constant_args	TypeError)r+   rG   s     r   get_collective_group_sizerM   M   sn    $,-- @jr~6V6V @NNNNNN&&t'9"'=>>>>>>???r   c                       e Zd ZdZdZdZdS )NCCL_HWr   r   r   N)r   r   r   NVLINKPCINETr   r   r   rO   rO   [   s        F
C
CCCr   rO   c                       e Zd ZdZdZdS )	NCCL_ALGOr   r   N)r   r   r   TREERINGr   r   r   rT   rT   a   s        DDDDr   rT   c                       e Zd ZdZdS )
NCCL_PROTOr   N)r   r   r   LLr   r   r   rX   rX   f   s         
BBBr   rX   g333333@gffffff@g333333?      ?g      @g@)     C@r[   gffffff4@)gU@g     6@g      3@c                    | j         }|J t          |dd          }d|v sd|v sd S ddlm} |j        d         } ||          }t
          j                            |          }t          j        d|           }t          |          }t          |           \  }	}
d	|v r|	d
d          |	d         z   }		 t
          j                            ||          5 } ||	i |
}t
          j        j        j                            |           d d d            n# 1 swxY w Y   n2# t           $ r%}t"                              |           Y d }~d S d }~ww xY w|j        }|dk     rd S |dz  }|S )Nr3   r    r.   r/   r   )_resolve_process_grouprH   zcuda:all_gather_into_tensor_outr   )groupdevice     @@)r+   getattrrJ   r]   rK   r$   distributedget_rankr`   evalr   _time_estimatorops_c10d_functionalwait_tensordefault	Exceptionloginfoestimated_time)snodekernelpy_kernel_namer]   pg_namepgrankr`   fnargskwargstime_estimatorweest_time_usest_time_mss                   r   /estimate_nccl_collective_runtime_nccl_estimatorr}      s   ZFV%92>>NN**.>..P.PtIIIIII"2&G			(	(B!**2..D \.$..))F	n		B$U++LD& $~55ABBx$q'!	..V / 
 
 	>D#F##AI&2::1===		> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	>
    ttttt
 !/K Qt#Ks<   4!D 2DD DD DD 
E)E		Ec                    t          |           }|dz  dz  dz  }d}t          |           }t          j        ||z            }|}|dk    rdS t          j        }t          j        }t          |           }	t          j
        j        j        }
t          j
        j        j        }t                      }|dk    r|dz
  nd}|dk    r|nd}t          |         |         }|dk    r|
n|}d}||z  }t!          |||dk    s|	t"          j        k    rdndz            }|	t"          j        k    r	d|dz
  z  }n8|	t"          j        k    r	d|dz
  z  }n|	t"          j        t"          j        fv r|dz
  }d|z  |z  }||z  }|d	z  }t,          j        }|	t"          j        k    r|dk    rd|z  }n-d}n*|	t"          j        t"          j        t"          j        fv r|dz
  }t0          |         |         }t2          |         |         |         }t2          t,          j                 |         |         }d
}|dk    rd}t7          ||          }|||z
  |z  ||z  z   z  }|dz  }||z  }||z   }|dz  } | S )a:  
    Returns estimated NCCL collective runtime in milliseconds (ms).

    The following heuristics are copied from https://github.com/NVIDIA/nccl/blob/master/src/graph/tuning.cc.
    We aim to estimate the runtime as accurately as possible.

    Assumptions:
    - only ring algorithm (NCCL_ALGO_RING) is used
    - only Low-Latency protocol (NCCL_PROTO_LL) is used, i.e. Simple or LL128 is not used
    - 8 gpus per node  # TODO: Need to find a way to get accurate "gpus per node" and "# nodes" info.
    - collective is one of: allreduce, reducescatter, allgather
    i      r   r   r   g      ?gUUUUUU?rZ   g    eAg        ra   g    .A)rE   rM   mathceilrT   rV   rX   rY   r5   r$   	_inductorconfigintra_node_bwinter_node_bwr*   llMaxBwsminr   r   r   r   r   rO   rP   baseLathwLatrR   max)!r+   tensor_storage_size_bytestensor_storage_size_GBnum_gpus_per_node
group_sizenNodesnRanks	nccl_algo
nccl_protocollbwIntrabwIntercompCapIndexindex2index1llMaxBwbw	nChannelsbusBwnstepsratio	bandwidthbandwidth_GB_per_nsintraHwnInterStepslatencyintraLatinterLatnetOverhead
latency_nstransport_nsnsmss!                                    r    estimate_nccl_collective_runtimer      s    !@ E E6=DtK *400JYz$5566FF{{q IJt$$D
 o$2Go$2G>>L!Q;;VaZZAF#q[[\\aFvv&G aKKWBINE !ty/C'C'C99)	U E y###fqj!	%	%	%fqj!	)*I,@A	A	A! 6\V#EI#c/ nGy###A::f*KKKK	)*I,@)BVW	W	Wqj i ,GW~i(4HW[!),Z8H Kzz8[))H$0;3IIIG3J *,??L	
	"B	cBIr   )&	functoolsloggingr   enumr   typingr   r;   r$   r    r   r%   r   r   r	   virtualizedr
   	getLoggerr   rl   r   r   	lru_cacher*   IRNoder5   r=   rE   rM   rO   rT   rX   r   r   r   floatr}   r   r   r   r   <module>r      s                             C C C C C C C C C C       g!!           g    
&o 
& 
& 
& 
&Jbi JI J J J J$
") 
 
 
 
 
@BI @# @ @ @ @    g          
        	
 		  
	 
	 
		,      ,)huo ) ) ) )Xf29 f f f f f f fr   