
    fPi%a                       d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlmZ d dlmZmZ d dlmZmZmZmZ d dlZ ej        e          ZddZd	 Z d
 Z!d Z"d Z#d Z$d Z%edk    r e%             dS dS )    )annotationsN)setup_logger)add_io_bindings_as_tensorsget_initial_inputs_and_outputs)
AutoConfigAutoModelForCausalLMAutoTokenizerBitsAndBytesConfigargsargparse.Namespacec                    | j         dv rd }| j        dk    rx| j        dk    rmt          dddt          j                  }t          j        | j        dk    r| j        n| j	        | j
        | j        | j        | j        dd|| j        d	i
	  	        }n	 t          j        | j        dk    r| j        n| j	        | j
        | j        | j        | j        d| j        dk    rdnd                              | j                  }n# t"          $ rx}t%          d|           t          j        | j        dk    r| j        n| j	        | j
        | j        | j        | j        dd                              | j                  }Y d }~nd }~ww xY w|                                 | j         dk    rt	          j        |          }nHt+          j                    }| j        dk    rdd| j        ifnd}t+          j        | j        ||g          }|S )N   pt-eager
pt-compileint4cudaTnf4)load_in_4bitbnb_4bit_use_double_quantbnb_4bit_quant_typebnb_4bit_compute_dtype flash_attention_280GB)	cache_dirtorch_dtypeuse_auth_tokentrust_remote_code	use_cacheattn_implementationquantization_config
max_memorysdpa)r   r   r   r   r   r    z&Try to load a model using eager mode: eagerr   CUDAExecutionProvider	device_idCPUExecutionProvider)sess_options	providers)benchmark_typeonnx_precisiondevicer
   torchfloat16r   from_pretrainedhf_dir_path
model_namer   r   authtrustr&   totarget_device	ExceptionprintevalcompileortSessionOptionsInferenceSessiononnx_model_path)r   model
bnb_configer(   eps         /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/onnxruntime/transformers/models/llama/benchmark_e2e.py	get_modelrC   8   sV   888&((T[F-B-B+!*.$)',}	  J )8$($4$:$:  . ,#y"&*$7$. NF3
 
 
EE),<(,(8B(>(>D$$DO"n $ 0#'9&*j"@Dv@U@U)<)<[a   "T'((   ) ) ) >BBB,<(,(8B(>(>D$$DO"n $ 0#'9&*j"(/   "T'(( 	) 	

,..M%((E )++ {f$$ %{DN&CDD' 	
 $T%9ac`deeeLs   A+C= =
E?A.E::E?c                   | j         dk    r3t          j                    5   |di |}d d d            n# 1 swxY w Y   d }| j         dv r0| j        dk    r$t          j                            | j                   n1t          |||| j        | j	                  }|
                                 t          j                    }t          |          D ]}| j         dv rct          j                    5   |di |}| j        dk    r$t          j                            | j                   d d d            n# 1 swxY w Y   n|                    |           |                                 t          j                    }||z
  |z  }	|	|fS )Nr   r   cpu )r*   r-   no_gradr,   r   synchronizer5   r   use_fp16use_buffer_sharesynchronize_inputstimeperf_counterrangerun_with_iobindingsynchronize_outputs)
r   r>   runsinputsoutputs
io_bindingstart_endavgs
             rB   run_inferencerY   x   s
   l**]__ 	& 	&eoofooG	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& J888;%J""4#5666/vwW[Wlmm
%%''' E4[[ - -"<<< ? ?%//&//;%''J**4+=>>>? ? ? ? ? ? ? ? ? ? ? ? ? ? ?
 $$Z000**,,,,



C;$
C<s   	488+8D//D3	6D3	c           
         t                       t          ||||| j        | j        | j        | j                  \  }}t          | || j        ||          \  }}||fS N)clear_cacher   r5   rI   rJ   enginerY   warmup_runs)	r   r>   config	tokenizerprompt_lengthpromptrR   rS   rV   s	            rB   prepare_model_for_inferencerc      sd    MMM4	=&$2DdmUYUjlplw OFG tUD,<fgNNJAw7?    c                 h    t          j                     t          j                                         d S r[   )gccollectr-   r   empty_cacherF   rd   rB   r\   r\      s'    JLLL	Jrd   c                    t          j        | ddddddddd	|d
z   dd|d
z   dd	| dd| dddg          }|                    |d           t                              d| d           d S )Nz
Batch SizezPrompt LengthzPrompt Processing Latency (ms)z"Prompt Processing Throughput (tps)zSampling Latency (ms)zSampling Throughput (tps)z"First Token Generated Latency (ms)z&First Token Generated Throughput (tps)Average Latency of First    z Tokens Generated (ms)Average Throughput of First z Tokens Generated (tps)zWall-Clock Latency (s)zWall-Clock Throughput (tps))columnsF)indexzResults saved in !)pd	DataFrameto_csvloggerinfo)resultsfilename
gen_lengthdfs       rB   save_resultsry      s    	,0#'04O
aOOOS:?SSSJ
JJJN:NNN$)

 
 
B( IIheI$$$
KK/H///00000rd   c                    t          j                    } |                     ddt          dg d           |                     ddt          dd	
           |                     ddddd           |                     ddddd           |                     ddt          t          j                            dd          d           |                     dt          dd           |                     dddd           |                     d d!dt          j                            dd"d#d$          d%&           |                     d'ddd(           |                     d)ddd*          f |                     d+d,d-.           |                     d/d0d1.           |                     d2d3dt          d4g d5d67           |                     d8d9t          d:d;           |                     d<d=t          t          j	        
                                rd>nd?d?d>g@           |                     dAdBt          dCD           |                     dEdFt          dGD           |                     dHdIt          dJD           |                     dKt          dLD           |                                 }t          j                            |j                   t          j        |j                   dM|j        v rKt#          |dN|j                                         dO           |j        dPk    r|j        dQ|j        if|_        |j        dMk    r|j        s
J dR            |j                            dS          |_        |j                            dS          |_        t#          |dT|j                   |j        dUv s|j        dVk    r|j        d?k    rd4ndW|_        |j        d?k    r
dX|j         n|j        }|j        dWk    rt          j        nt          j        }|j        dMk    rdMndY}t#          |dZ|           t#          |d[|           t#          |d\|           t#          |d]|j        dWk               |j        o|dMk    |_        |S )^Nz-btz--benchmark-typeT)r   r   r:   )typerequiredchoicesz-mz--model-nameFz<Hugging Face name of model (e.g. 'meta-llama/Llama-2-7b-hf'))r{   r|   helpz-az--auth
store_truez5Use Hugging Face authentication token to access model)defaultactionr~   z-tz--trustzeWhether or not to allow for custom models defined on the Hugging Face Hub in their own modeling filesz-cz--cache-dir.model_cachezPath to directory containing all Hugging Face files (e.g. config, tokenizer, PyTorch model). Use when loading model as `AutoModel.from_pretrained(model_name, cache_dir=cache_dir)`.)r{   r   r~   z--hf-dir-pathr   zPath to directory containing all Hugging Face files (e.g. config, tokenizer, PyTorch model). Use when loading model as `AutoModel.from_pretrained(folder_path)`.z-oz--onnx-model-pathzPath to ONNX model)r|   r~   z-fz--prompts-filemodelsllamazprompts.jsonzsJSON file containing entries in the format 'prompt length: prompt' where prompt length = tokenized length of prompt)r|   r   r~   z--use_buffer_sharez3Use when GroupQueryAttention (GQA) is in ONNX modelz--anomaly-filteringzUse this flag to filter anomaly accelerator times for tokens generated.               This may give more accurate latency and throughput metrics for tokens generated.               Wall-clock metrics are still reported with anomaly times though.z-bz--batch-sizesz1 2)r   z-sz--prompt-lengthsz16 64 256 1024z-pz--precisionfp32)r   int8fp16r   zePrecision for model. For ONNX models, the model's precision should be set before running this script.)r|   r{   r   r}   r~   z-gz--generation-length   z Number of new tokens to generatez-dz--devicer   rE   )r{   r   r}   z-idz--device-idr   )r{   r   z-wz--warmup-runs   z-nz
--num-runsd   z--seedrk   r:   execution_providerExecutionProviderr%   r&   z,Please specify a path to `--onnx-model-path` r+   >   r   r   r   r   zcuda:ptr5   r   r]   rI   )argparseArgumentParseradd_argumentstrospathjoinintr-   r   is_available
parse_argsnprandomseedmanual_seedr*   setattrr,   upperr   r&   r=   batch_sizessplitprompt_lengths	precisionr.   float32rJ   )parserr   r5   r   r]   s        rB   get_argsr      s~   $&&F
111     K     D     t     S-00 D      p	     !	     S(G^DD C     B	     	!P	 	 	
 	
	 	            000t     /     *1133>     }3BBB
oCCCC
lcBBB
sA666D INN49	di    ###*t{/@/@/B/B,U,U,UVVV"&==='+'>dn@]&^D# e###SS%SSS#'--c22D-33C88D D"DN333.$44469Q9QVZVaejVjVjqw 	N 15u0D0D,DN,,,$+M#'>V#;#;%--K)U22UUFD/=111D----D(F###D*dn6777 1EfoDKrd   c                 b  9: t                      } t          d           t                              | j                   d }t          | j                  5 }t          j        |d           }d d d            n# 1 swxY w Y   t          j
        | j        dk    r| j        n| j        | j        | j        | j                  }t!          j
        | j        dk    r| j        n| j        | j        | j        | j                  }t#          |           }g }t%          j        | j        | j                  D 	]<\  }}t-          |          t-          |          }}t                              d| d|            t/                       || j        z   }	||vrOt3          t5          j        d| d	| j         d
| j         d| d| d| d| d| d| j         d| d                    ||         g|z  }
||g}	 t                              d           t9          | |||||
          \  }}t;          | || j        ||          \  }}|dz  }|||z  z  }t                              d| d           t                              d|||z  z   d           |                    ||g           t                              d           t/                       t9          | |||||
          \  }}|d                                          }|j!        d         }|j"        }tG          |d          r|j$        n|j%        |j&        z  }tO          j(        || j)        tN          j*                  }g }g }tW          j,                    }||	k    rt;          | |d||          \  }}|-                    |           tW          j,                    }|d         j!        d         dk    r|d          .                    d          dz
  }|/                    d!          0                    d|j1                  2                    |d|j1                  }tO          j3        |d         d|          4                                }n|d         d d dd d f         }tO          j5        |d!          }||z  |j6        k    }|7                    ||j6                  8                    |dg          }tW          j,                    } |-                    | |z
             tO          j9        ||gd!          }|dz  }||d<   tO          j9        |d          | :                    tN          j;                  8                    |d          gd          |d <   d"|v r<tO          j<        |d"         d!          d#         8                    |d          dz   |d"<   |d         j!        d         dk    r,|d         d d d dd d f         =                                |d<   |d         >                                 | j?        d$k    r|d%         |d%<   n| j@        st          |jB                  D ](}!|d&|! d'         |d(|! d'<   |d&|! d)         |d(|! d)<   )|d          j!        d         }"t          |jB                  D ]}!tO          j(        |||"|| j)        | jC                  }#tO          j(        |||"|| j)        | jC                  }$|D                    d&|! d'|#=                                d&|! d)|$=                                i           ||	k    tW          j,                    }%|E                    d#           | jF        rzd*9t          |          :t          |          }&t          t          9:fd+|                    }t          |          }'t                              d,|&|'z
   d-9 d.:dz   d/           t]          |          t          |          z  }(|(dz  })|d|(z  z  }*t                              d0|) d           t                              d1|* d           |d#         }+|+dz  },|d|+z  z  }-t                              d2|, d           t                              d3|- d           | j        d4z  }.t]          |d |.                   t          |d |.                   z  }/|/dz  }0|d|/z  z  }1t                              d5|. d6|0 d           t                              d7|. d6|1 d           t]          |          t          |          z  }2|2dz  }3|d|2z  z  }4t                              d5| j         d6|3 d           t                              d7| j         d6|4 d           |%|z
  }5||| j        z   |5z  z  }6t                              d8|5 d9           t                              d:||| j        z   |5z  z   d           t                              d;           |                    |)|*|,|-|0|1|3|4|5|6g
           |-                    |           	# t          $ r.}7t                              d<| d| d=|7            Y d }7~7	6d }7~7ww xY wd>| j?         d?t          jL        M                                d@dA}8t          ||8| j                   d S )BNFc                >    d |                                  D             S )Nc                4    i | ]\  }}t          |          |S rF   )r   ).0kvs      rB   
<dictcomp>z*main.<locals>.<lambda>.<locals>.<dictcomp>j  s$    <]<]<]41aSVVQ<]<]<]rd   )items)ds    rB   <lambda>zmain.<locals>.<lambda>j  s"    <]<]STSZSZS\S\<]<]<] rd   )object_hookr   )r   r   r   zRunning batch size = z, prompt length = z2
                                A prompt of size z was not found in 'zv'. There are a couple of solutions to fix this.
                                1) You can change one of the keys in 'z' to be z).
                                    If za < actual prompt's length, the benchmark E2E tool will repeat the first word in the prompt until zB = actual prompt's length.
                                    If zm > actual prompt's length, the benchmark E2E tool will automatically trim the actual prompt's length so that zd = actual prompt's length.
                                2) You can add a new key-value entry in 'z' of the form 'z,': 'your prompt goes here'.
                zMeasuring prompt processing...i  z&Average Latency of Prompt Processing: z msz)Average Throughput of Prompt Processing: z tpszMeasuring token generation...	input_idshead_dim)r,   dtype   logitsattention_mask)dimposition_idsr   r   past_key_valueszpresent.z.keyzpast_key_values.z.value
   c                    | z  k     S r[   rF   )acc_timeanomaly_threshold_factor
min_time_ss    rB   r   zmain.<locals>.<lambda>  s    H7OR\7\,\ rd   zFiltered out z$ anomaly accelerator times that are zx greater than z ms...zAverage Latency of Sampling: z Average Throughput of Sampling: z"Latency of First Token Generated: z%Throughput of First Token Generated: rk   rj   z Tokens Generated: rl   zWall-Clock Latency: z szWall-Clock Throughput: zAdding results to CSVz$Could not benchmark at batch size = z - 
benchmark__e2e_z%Y-%m-%d_%H:%M:%Sz.csv)Or   r   rs   rt   __dict__openprompts_filejsonloadr   r/   r0   r1   r   r2   r3   r	   rC   	itertoolsproductr   r   r   r\   generation_lengthNotImplementedErrortextwrapdedentrc   rY   num_runsextendcloneshapenum_key_value_headshasattrr   hidden_sizenum_attention_headsr-   zerosr5   boolrL   rM   appendsum	unsqueezerepeat
vocab_sizeviewgathersqueezeargmaxeos_token_idmasked_fillreshapecatr4   int64max
contiguouszero_r]   rJ   rN   num_hidden_layersr   updatepopanomaly_filteringminlenlistfilterr6   datetimenowry   );r   size_to_promptfr_   r`   r>   all_csv_metrics
batch_sizera   
max_lengthrb   csv_metricsrR   rS   accelerator_prompt_latency_saccelerator_prompt_latency_msaccelerator_prompt_thrptall_token_idscurrent_length	num_heads	head_sizehas_eosaccelerator_timessampling_timeswall_clock_start_timeaccelerator_time_latency_ssampling_start_timeprompt_end_indicesidxsnext_token_logitsnext_tokenstokens_to_addsampling_end_timeinew_sequence_lengthpresent_keypresent_valuewall_clock_end_time	orig_sizenew_sizeavg_sampling_latency_savg_sampling_latency_msavg_sampling_thrptfirst_token_latency_sfirst_token_latency_msfirst_token_thrpthalfwayhalfway_token_latency_shalfway_token_latency_mshalfway_token_thrptall_token_latency_sall_token_latency_msall_token_thrptwall_clock_latency_swall_clock_thrptr@   rv   r   r   s;                                                            @@rB   mainr"  b  sV   ::D
KK N	d	 	  _A12]2]^^^_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ ' ,22.y*	  F - ,22.y*	  I dOOEO%.%6t7GI\%]%] ]t ]t!
M$'
OOS5G5GM
YJYY-YYZZZ"T%;;
..%2? TXTe GKGX bo  )6  Yf  )6	  er	 
 KOJ[ 
 mz   
 
 
 !/0:=!=1H	tKK89999$vyZgioppOFG4A$t}^dfm4n4n1(' -I4,O)'1]Ea5a'b$KKcA^cccdddKK}J-ZvJv<w}}}    =?WXYYY KK7888MMM9$vyZgioppOFG";/5577M*04N2I#*6:#>#>tFDVZ`ZtDt  k*T5GuzZZZG !#N$($5$7$7! J..6CD%QRTZ\c6d6d3*G!(()CDDD '+&7&9&9#8$*1-11)/0@)A)E)Ea)H)H1)L&*444;;6#455j!V->?? 
 ).WX5F4(P(P(X(X(Z(Z%%(/(9!!!R((C%#l+<"EEE "K/93II !, 7 7AW X X ` `blnoap q q$($5$7$7!%%&7:M&MNNN %	=-*Hb Q Q Q!# '4{#+09,-}}U[/I/I/Q/QR\^_/`/`acd, ,'( "V++-2Yvn7MST-U-U-UVW-X-`-`akmn-o-ors-sF>* 8$*1-22(/(9!!!RaR((C(N(N(P(PGH%!''))) ;$&&078I0JF,--. "6#;<< ] ]=DEWPQEWEWEW=X9!999:?FG[RSG[G[G[?\;!;;;<<*01A*B*H*K'"6#;<<  &+k&%/%#'#5"&"2' ' ' ).&%/%#'#5"&"2) ) )   21 2 2 2K4J4J4L4L 41 4 4 4m6N6N6P6P   Q !J..^ #'"3"5"5 !!!$$$% 
+-( !233
 122	$(\\\\\^opp% %! 011 aI$8  a  a^v  a  a  HR  UY  HY  a  a  a   &)%8%83~;N;N%N"&<t&C#!+q3I/I!JKKT8OTTTUUUKKS;MSSSTTT %6a$8!%:T%A" *a2G.G HKKX=SXXXYYYKKW@QWWWXXX ,1G&)*;HWH*E&F&FM^_g`g_gMhIiIi&i#'>'E$",4K0K"LKKmGmmPhmmmnnnKKlwllSflllmmm #&&7"8"83?P;Q;Q"Q#6#= (A0C,CDOKKpD,BppWkppp   KKwt7Mwwbqwwwxxx $79N#N )md>T.TXl-lmKKG/CGGGHHHKK~*I_9_cw8w*x~~~  
 KK/000+&*%,'(#($   "";//// 	t 	t 	tKKrzrr]jrroprrssssssss	t ^DK]]h.?.C.C.E.E]]]]H(D,BCCCCCs+   A99A= A=0ah33
i+=#i&&i+__main__)r   r   )&
__future__r   r   r   rf   r   r   loggingr   r   rL   numpyr   pandasrp   r-   benchmark_helperr   llama_inputsr   r   transformersr   r   r	   r
   onnxruntimer:   	getLogger__name__rs   rC   rY   rc   r\   ry   r   r"  rF   rd   rB   <module>r.     s  @ # " " " " "   				       				            ) ) ) ) ) ) S S S S S S S S \ \ \ \ \ \ \ \ \ \ \ \    		8	$	$= = = =@  >    
1 1 12a a aHzD zD zDz zDFFFFF rd   