
    PiP                        d dl mZmZmZ d dlZd dlZdZdZdZdZ	ddd	d
ddddddd
ddddddd
dddddd
ddddddddZ
d/dee         fdZdZ	 d0ded ee         d!ee         d"eeej        ef                  fd#Z	 d/d$ej        d%ej        d&ej        d!ee         dee         d"ej        fd'Zd$ej        d%ej        d&ej        d ee         d!ee         dee         fd(Z	 d/d ee         d!ee         d)edee         fd*Z	 d0ded+ee         d"eeej        ef                  fd,Z	 d/d+ee         dee         fd-Z	 d/d$ej        d%ej        d&ej        d+ee         dee         f
d.ZdS )1    )ListOptionalUnionNg      ?         g vCg 
`Cg   .YvBg(\?gq=
ףp?)bf16_peak_topsfp8_peak_topspeak_mem_bw_bytes_secpct_achievable_gemm_topspct_achievable_mem_bwg sCg s/Cg s?Cg   B)r	   r
   fp4_peak_topsr   r   r   g ֒Cg ֒"Cg   xHBg  @Bg  @Bg  @Cg  BwC)r	   r
   r   r   )zNVIDIA H100zNVIDIA B200zNVIDIA GB200zAMD Instinct MI300XzNVIDIA GeForce RTX 5090gpu_namec                 ^    | t           j                            d          } t          |          S )Nr   )torchcudaget_device_namegpu_name_to_specs)r   s    {/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torchao/testing/training/roofline_utils.py	get_specsr   V   s(    :--a00X&&    g>Ftensor_rolefloat8_recipe_namemx_recipe_namereturnc                     ||z  }d}|dk    rb|dk    r.|rd}	n
t           |z  }	t           |z  t          |z  z   }
|
}|	d|
|g}n|rd}	n
t           |z  }	t           |z  dt          z  |z  z   }
|	d|
g}nd|dk    r|dk    rN|rdt          |z  z   }	nt           |z  t          |z  z   }	t           |z  dz  }t           |z  t          |z  z   }
|	||
g}n
|rdt          |z  z   t           |z  z   }	n t           |z  t          |z  z   t           |z  z   }	t           |z  t          |z  z   }|	|g}n|dk    rq|dv r*|rdt          |z  z   }	nt           |z  t          |z  z   }	|	g}n|dk    r3t           |z  t          |z  z   }	d}t           |z  t          |z  z   }
|	||
g}nGJ d
            |dk    r*|rdt          |z  z   }	nt           |z  t          |z  z   }	|	g}n|dk    rw|dv r;|rdt          |z  z   }	nt           |z  t          |z  z   }	t           |z  t          |z  z   }n3|dk    r#t           |z  t          |z  z   }	t          |z  dz  }n
J d
            |	|g}n|dv sJ d|            |dk    r;|rdt          |z  z   }	nt           |z  t          |z  z   }	t           |z  t          |z  z   }n:|rdt          |z  z   }	nt           |z  t          |z  z   }	t           |z  t          |z  z   }|	|g} fd|D             }d |D             }|S )a/  
    Calculates the roofline estimate of casting one of the gemm inputs
    (input, weight or grad_output) to float8 in fwd+bwd.

    Inputs: dim0 and dim1 (shape), tensor_role (input|weight|grad_output), recipe names
    Outputs: list of read/write traffic overhead in seconds, one for each kernel
    N
tensorwiseweightr   r   rowwiserowwise_with_gw_hp)inputgrad_outputFunsupported mxfp8_32x32_flexible_gemm_layoutmxfp8_32x32_weight)mxfp8_emulatedmxfp8_cublasmxfp8_cublas_rceilmxfp4_cutlasszunsupported mx_recipe_name=r)   c                 8    g | ]}|d          z  d         z  S r   r    .0xspecss     r   
<listcomp>z4get_tensor_memory_traffic_ovhd_s.<locals>.<listcomp>  ?        	
E)**U3J-KK  r   c                 B    g | ]}t          j        |t                    S r,   sympyMaxKERNEL_LAUNCH_OVERHEAD_SECr.   r/   s     r   r1   z4get_tensor_memory_traffic_ovhd_s.<locals>.<listcomp>  %    EEE!UYq455EEEr   )BYTES_PER_EL_BF16BYTES_PER_EL_FLOAT8BYTES_PER_EL_FLOAT4)r0   dim0dim1r   r   r   fuse_with_prevnumel	res_byteskernel_1_rwkernel_3_rwkernel_4_rwkernel_2_rwres_ss   `             r    get_tensor_memory_traffic_ovhd_srG   b   sj   " 4KEI\))(""  8 0%7+e36IE6QQK%K$akBII  8 0%7+e3a:M6MPU6UUK$a5II	y	(	((""  V"5"==/%7:MPU:UU+e3a7K+e36IE6QQK$k;?II  
 +e336G%6OO 
 &-)E12'%/0 
 ,e36IE6QQK$k2II	3	3	3222  V"5"==/%7:MPU:UU$IIH$$
 ,e36IE6QQKK+e36IE6QQK$k;?II'-''5	=	=	=  	R1E99KK+e36IE6QQK M			/	/	/
 222  V"5"==/%7:MPU:UU+e36IE6QQKKH$$ ,e36IE6QQK-59KK (-''5 +.		  "
 
 
 

 ,.++
 
 
 _,, V"5"==/%7:MPU:UU+e36IE6QQKK V"5"==/%7:MPU:UU+e36IE6QQK +.	     E FEuEEEELr   MKNc                    t          |          }d| z  |z  |z  }|t          j        u r	|d         }nG|t          j        t          j        fv r	|d         }n$|t          j        u r	|d         }nJ d|             ||z  |d         z  }	| |z  ||z  z   }
| |z  }|t|                    d          sJ d	|             |t          j        t          j        t          j        fv s
J d
            |                    d          rdnd}|
|z  }|
|z   }
|t          j        u r|
t          z  |t          z  z   }na|t          j        t          j        fv r|
t          z  |t          z  z   }n1|t          j        u r|
t          z  |t          z  z   }nJ d|             ||d         z  |d         z  }t          j        |	|t                    S )Nr   r	   r
   r   Fzunsupported dtype: r   )mxfp8mxfp4nvfp4zUnsupported recipe r#   mx       r   r   )r   r   bfloat16float8_e4m3fnfloat8_e5m2float4_e2m1fn_x2
startswithr:   r;   r<   r5   r6   r7   )rH   rI   rJ   dtyper   r   r0   gemm_ops	peak_topscompute_gemm_time_s	num_reads
num_writes
block_sizenum_scale_readsbytes_rwmem_gemm_time_ss                   r   get_individual_gemm_time_sympyra     s0    hE1uqy1}H*+			5&(9:	:	:/*			%(	(	(/*		33E3333u"Y.7Q1RR AAIQJ!(()DEE 	
 	
2.22	
 	
E "
 
 
 
 	
 
 
 *44T::BRR
#z1/	00:@Q3QQ	5&(9:	:	:22ZBS5SS	%(	(	(22ZBS5SS33E3333u5011E:Q4RR  9(/;UVVVr   c                     |||}	}}|dk    rt           j        }	t          | |||||          }
t          | |||||          }t          || ||	||          }|
|z   |z   }|S )Nr    )r   rR   ra   )rH   rI   rJ   rW   r   r   r   gemm_dtype_inputgemm_dtype_grad_inputgemm_dtype_grad_weightgemm_output_time_sgemm_grad_input_time_sgemm_grad_weight_time_stotals                 r   get_gemm_time_sympyrj   S  s     	 .D+
 111!&7	1a!>8  <	1a&  =	1a'  !77:QQELr   enable_fusion_modelingc           	          t          |          }t          || |d|||          }t          |||d||d          }	t          || |d|||          }
t          g ||	|
          }|S )Nr!   )r   r   r   r?   r   Fr"   )r   rG   sum)rH   rI   rJ   r   r   rk   r   r0   fwd_fp8_input_memfwd_fp8_weight_memgi_fp8_grad_output_memress               r   get_float8_mem_sympyrr   u  s     hE 9		-%-   :		-%   >		!-%-   P!P$6P9OP
Q
QCJr   recipe_namec                     |du s
J d            |dk    s
J d            ||z  }d}h d}|xdk    r& t           |z  }	t           |z  t          |z  z   }
|	|
g}nxdk    r' t           |z  t          |z  z   }	|	t          |z  z  }	|	g}nx}|rB|                    d	          r- t           |z  t          |z  z   }	|	t          |z  |d
z  z  z  }	|	g}n}|r|                    d          s|                    d          rdt           |z  t          |z  z   }	|                    d          r
|	t          z  }	|                    d          rd
nd}|	t          |z  ||z  z  z  }	|	g}n	 t          d| d|            fd|D             }d |D             }|S )z
    Inference version of `get_tensor_memory_traffic_ovhd_s`.
    The only thing happening here is we quantize the activation.
    Fr#   r!   z*inference only quantizes input activationsN>   mxfp4*mxfp8*nvfp4*r   r   r   r   rL   rP   rM   rN   rQ   zUnknown recipe name: z. Allowed recipes: c                 8    g | ]}|d          z  d         z  S r+   r,   r-   s     r   r1   z>get_inference_tensor_memory_traffic_ovhd_s.<locals>.<listcomp>  r2   r   c                 B    g | ]}t          j        |t                    S r,   r4   r8   s     r   r1   z>get_inference_tensor_memory_traffic_ovhd_s.<locals>.<listcomp>  r9   r   )r:   r;   BYTES_PER_EL_FLOAT32rV   r<   
ValueError)r0   r=   r>   r   rs   r?   r@   rA   allowed_recipesrB   rC   namer]   rF   s   `             r   *get_inference_tensor_memory_traffic_ovhd_sr~     s-    U"""M"""'!!!#O!!! 4KEIMMMO
\\\\ ,e3K+e36IE6QQK$k2IIYYYY ,e36IE6QQK/$66K$IITTTdoog66TT ,e36IE6QQK.5DDK$IITTtw77T4??7;S;ST ,e36IE6QQKw'' 433#w77?RJ.59KLLK$II6 6 6$36 6       E FEuEEEELr   c                 p    t          |          }t          || |d|d          }t          g |          }|S )Nr!   F)r   rs   r?   )r   r~   rm   )rH   rI   rJ   rs   r   r0   rn   rq   s           r   get_inference_float8_mem_sympyr     sV     hE C		   "!"
#
#CJr   c                 .    t          | |||||          }|S N)ra   )rH   rI   rJ   rW   rs   r   rf   s          r   get_inference_gemm_time_sympyr     s)     8	1aX  r   r   )F)typingr   r   r   r5   r   r<   r;   r:   rz   r   strr   r7   SymbolfloatrG   ra   rj   boolrr   r~   r   r   r,   r   r   <module>r      s   ) ( ( ( ( ( ( ( ( (      
 ! !'$( "&  " "( %) "& & " "( %) "& $ " !' %) "&  # !)   uB B J' ' ' ' ' ' +  v v 	v
 !v SMv 
%e#
$%v v v v~ #5W 5W|5W|5W |5W
 SM5W sm5W \5W 5W 5W 5Wp|| |
 ! SM sm   R #3 3 !	3
 SM3 !3 sm3 3 3 3x M M 	M
 #M 
%e#
$%M M M Mj #  #	
 sm   8 # || |
 # sm     r   