
    Pi?              3          d dl Z d dlZd dlmZ d dlmZ d dlmZmZ d dlZd dl	Zd dl
Zd dlZd dlmZmZmZ d dlmZ d dlmZ dej        j        _        ej        j                            d	            G d
 d          Zd Zd Zej                                        rdnej                                        rdndZ ee           j!        j!        "                                Z#e j$        %                     e&e#                     d dl'm(Z(m)Z) d dl*m+Z+ d Z,dde-dee.         fdZ/dde-dee.         fdZ0de(dej1        dej1        dej1        fda2de(dej1        dej1        deej1        ej1        f         fda3d fde(d ej1        dej1        d!e.fd"Z4d# Z5 ej6                    d$ dddddddd%de(d&ej1        d'e.d(e.d)e7d*e7d+ee.         d,e7d-eej        j8                 d.eej        j8                 d/eej        j8                 d0eej        j8                 dej1        fd1            Z9d	efd2Z:d3 Z;d4\  Z<Z=dd5ddd6d7d8d9d: ed;          dddddddd	dddeej>        dddfd<ee.         d&e&d=ee&         d)e7d>e.d'e.d(e.de.de-d?ed@ee&         dAee-         dBee&         d*e7d+ee.         d,e7dCe7dDe7dEe7dFee         dGee         dHee         dIee         dJe7ddf2dKZ?e@dLk    rmd dlAZA eAjB        dMN          ZCeCD                    dOe.ddPQ           eCD                    dRe&d5dSQ           eCD                    dTe&dUV           eCD                    dWdXdYZ           eCD                    d[e.d6d\Q           eCD                    d]e.d9d^Q           eCD                    d_e.d8d`Q           eCD                    dae.d9dbQ           eCD                    dce-d:ddQ           eCD                    dee edf          dgQ           eCD                    dhdie&djV           eCD                    dke-ddlQ           eCD                    dmdne&doV           eCD                    dpdXdqZ           eCD                    dre.ddsQ           eCD                    dtdXduZ           eCD                    dvdXdwZ           eCD                    dxdXdyZ           eCD                    dzdXd{Z           eCD                    d|edd}Q           eCD                    d~eddQ           eCD                    de&edQ           eCD                    dd ej>        dQ           eCD                    deddQ           eCD                    deddQ           eCD                    ddXdZ           eCE                                ZF eGeF            e?eFjH        eFjI        eFjJ        eFjK        eFjL        eFjM        eFjN        eFjO        eFjP        eFjQ        eFjR        eFjS        eFjT        eFjU        eFjV        eFjW        eFjX        eFjY        eFjZ        eFj[        eFj\        eFj]        eFj^        eFj_        eFj`        eFja                   dS dS )    N)datetime)Path)OptionalTuple)get_arch_namewrite_json_result_localwrite_json_result_ossci)MappingType)get_model_size_in_bytesFTc                        e Zd Zd Zd Zd ZdS )	HostEventc                     d | _         d S N)
event_timeselfs    r/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torchao/_models/llama/generate.py__init__zHostEvent.__init__   s        c                 6    t          j                    | _        d S r   )timeperf_counterr   r   s    r   recordzHostEvent.record!   s    +--r   c                 l    | j         t          d          t          |j         | j         z
            dz  S )NzEvent not recorded!  )r   
ValueErrorabs)r   other_events     r   elapsed_timezHostEvent.elapsed_time$   s6    ?"2333;)DO;<<tCCr   N)__name__
__module____qualname__r   r   r    r   r   r   r      sF          . . .D D D D Dr   r   c                     d| v r t           j                            d          S d| v r t           j                            d          S d| v sd| v rt	                      S t          d|  d           d S )	NcudaT)enable_timingxpucpumpsdevice= is not yet suppported)torchr%   Eventr'   r   printdevices    r   device_timerr1   +   s    zd333	&yT222
6//u{{666677777r   c                     d| v r!t           j                            |            d S d| v r!t           j                            |            d S d| v sd| v rd S t	          d|  d           d S )Nr%   r'   r(   r)   r*   r+   )r,   r%   synchronizer'   r.   r/   s    r   device_syncr4   6   s    
v&&&&&	&	f%%%%%
6//u666677777r   r%   r'   r(   )Transformerprepare_inputs_for_model)get_tokenizerc                     t          j        |                               d          }t          j        | |z  dd                              t           j                  S )N   T)dimkeepdim)dtype)r,   
empty_likeexponential_argmaxtoint)
probs_sortqs     r   multinomial_sample_one_no_syncrE   Q   sP     	$$11!44A<
QB===@@uy@QQQr         ?temperaturetop_kc           	         | t          |d          z  } |t          j        | t          ||                     d                              \  }}|                    dd                              d          }t          j        | |k     t          d           |           } t          j	        j
                            | d          }|S )Ngh㈵>r:   Infr;   )maxr,   topkminsizeselect	unsqueezewherefloatnn
functionalsoftmax)logitsrG   rH   v_pivotprobss          r   logits_to_probsr\   X   s    c+t,,,Fz&#eV[[__"="=>>1R  **2..Ve^eEll]FCCH''B'77ELr   c                 ^    t          | d d df         ||          }t          |          }||fS )Nr:   )r\   rE   )rW   rG   rH   r[   idx_nexts        r   sampler_   c   s6    F111b5M;>>E-e44HU?r   modelx	input_posreturnc                 @     | ||          }t          |fi |d         S )Nr   )r_   r`   ra   rb   sampling_kwargsrW   s        r   prefillrg   i   s0     U1i  F&,,O,,Q//r   c                 Z    |j         d         dk    sJ  | ||          }t          |fi |S )Nr:   r9   )shaper_   re   s        r   decode_one_tokenrj   q   sC     ?2!####U1i  F&,,O,,,r   c                     | S r   r#   )rY   s    r   <lambda>rl          q r   	cur_tokennum_new_tokensc                    g g }}t          |          D ]}t          j        j                            t          j        j        j        j                  5  t          | ||fi |\  }	}
|	                                |
                                }
}	|dz  }|	                    |	                                            ||d                    |	                    |
           |	}d d d            n# 1 swxY w Y   ||fS )Nr9   r:   )
ranger,   rT   	attentionsdpa_kernel
SDPBackendMATHrj   cloneappend)r`   rn   rb   ro   callbackrf   
new_tokens	new_probsi
next_token	next_probs              r   decode_n_tokensr~   z   sJ    	J>"" # #X++EH,>,I,NOO 
	# 
	#$4y)% %/>% %!J	 %/$4$4$6$6	8I8I	JNIj..00111HZ^$$$Y'''"I
	# 
	# 
	# 
	# 
	# 
	# 
	# 
	# 
	# 
	# 
	# 
	# 
	# 
	# 
	# y  s   BC..C2	5C2	c                      | ||          S r   r#   )r`   ra   rb   s      r   model_forwardr      s    5Ir   c                     | S r   r#   ra   s    r   rl   rl      rm   r   )rx   kv_cache_quantization
cache_sizelinear_causal_maskprefill_start_eventprefill_end_eventdecode_start_eventdecode_end_eventpromptmax_new_tokens
batch_sizeinteractiver   r   r   r   r   r   r   c       	         F   |j         }|                    d          }|st          ||z   | j        j                  nd}t          d| d|            ||z
  }t          |          \  }}|                    |d          }t          j	        |||j
        |          }||ddd|f<   t          j         |          5  ||}||k    s
J d            |                     |||||	           ddd           n# 1 swxY w Y   |	|	                                 t          | |                    |d          |fi |                                }|                                |dd|f<   |
|
                                 ||                                 t          j        |g|t          j        
          }t'          | |                    |d          ||dz
  fd|i|\  }}t          j        |ddd|dz   f         g|R d          }||                                 |S )zp
    Takes a conditioning sequence (prompt) as input and continues to generate as many tokens as requested.
    r:   i^  zmax_seq_length=z, prompt_length=r9   r=   r0   NzBneed cache_size to be greater than max_new_tokens + size-of-prompt)max_batch_sizemax_seq_lengthr   r   prompt_lengthr0   r=   rx   rK   )r0   rO   rN   config
block_sizer.   r6   repeatr,   emptyr=   setup_cachesr   rg   viewrv   squeezetensorrB   r~   cat)r`   r   r   r   r   rx   r   r   r   r   r   r   r   rf   r0   Tr   ry   rb   seqr|   generated_tokensrY   s                          r   generater      s   . ]FBA ALTA 7888QT  

?N
?
?A
?
?@@@!#J 188FI]]:q))F +j.V
T
T
TCC2A2J 
f		 
 
'J^+++P ,++ 	%%"71 	 	
 	
 	

 
 
 
 
 
 
 
 
 
 
 
 
 
 
 &""$$$v{{:r**I 9H egg  ""$$C1I$  """ %!!###aSuyAAAI)
B''Q	 
   a )SGa!eG_8'788b
A
A
AC#!!!Js   </C77C;>C;c                     |                      |          }|r|                                 g|z   }t          j        |t          j        |          S )Nr   )encodebos_idr,   r   rB   )	tokenizerstringbosr0   tokenss        r   encode_tokensr      sO    f%%F
 /""$$%.<ei????r   c                    t          j        t          |           ddd          }d|v rdt          |           v r|d         }t          j        d          5  t	          j        |           }d d d            n# 1 swxY w Y   |                    |d           |                    ||          }|                                S )	NTr(   )mmapweights_onlymap_locationr`   storiesmeta)assignr   )	r,   loadstrr0   r5   	from_nameload_state_dictrA   eval)checkpoint_pathr0   	precision
checkpointr`   s        r   _load_modelr      s   O4d  J *c/.B.B!B!B(
	f		 7 7%o667 7 7 7 7 7 7 7 7 7 7 7 7 7 7	*T222HHF)H44E::<<s   A88A<?A<)z[INST]z[/INST]zHello, my name is   d   r9      g?z?checkpoints/meta-Transformer/Transformer-2-7b-chat-hf/model.pthprefill_sizedemo_summarize_promptnum_samplesr   quantizationmin_sqnrsparsitysavecompilecompile_prefillprofilememory_profilewrite_resultoutput_json_pathoutput_json_localc                 2   | Z| dk    rT|dt          |           dz
  z  }n<t          |d          5 }|                                }ddd           n# 1 swxY w Y   t          j        j                                         |	                                s
J |	            |	j        dz  }|                                sJ t          |                      t          d|            dt          |	          v }t          d	           t          j                    }t          |	||          }t          |
           t          dt          j                    |z
  dd           t          ||	          t          |d|          }|Kt          dd|          } |d| |                     d          z
           }t#          j        || fd          }|                    d          }!t#          j        d           d fd}"d }#|
rkddlm}$m}%m}&m}'m}(m})m}*m}+m},m}-m}.m }/m!}0 ddl"m#}1m$}2 d|
v rddl%m&}3  |3|           |
'                    d          r&ddl(}4ddl)}5ddl*}6|6+                    d           d|5,                    |4-                                          j.         d}7|
/                    d          }8t          |8d                    }9|8d         d!k    rdnt          |8d                   }:|8d"         d#k    rd$nd%}; |0| |(|9|:|;&                     |60                    |7           t          d'           tc          |t          |d|          ||d||(           |62                    |7           d)|
v r |0| |-                       d*|
v rr|rAd+|v r=dd,l3m4}<  |0| |, |<            -          .            |0| |,            |".           n/d/|
v r |0| |,d0                     n |0| |,                       d1|
v rZd}=d2|
v rd}=t          |
/                    d          d                    }:|:d3v sJ d4|:              |0| |*|:|=d 5                     nd6|
v rdd7l3m5}> t          |
6                    d6                    }?|?d8k    s|?d9k    sJ |?d8k    r5 |0| |)tn          j8        tn          j8         |>            :                     n;|?d9k    r5 |0| |+dtn          j8        tn          j8         |>            ;                     d<|
v rkd=|
v r<dd>l9m:}@  |0| |+d?tn          j8        tn          j8         |@            ;                     n+d+|v r'dd@l3m;}A  |0| |* |A            d A          |#.           dB|
v r |0| |'d"d                     	ndC|
v r |0| |-dDE          dF .           	n|
'                    dG          rddHl<m=}B ddIl>m?m@}CmA}D |
/                    d          d          }Et          |
/                    d          d                   }:t          t"          |Et"          jC                  }E|D                    |          } |D|d dJ|E|:K            |B|D                    |          dJ|F|L          E                    dMgd N           fdO}Gd2|
v }= |0| |C|E|:|=P          |G           n{dQ|
v rd2|
v rd}=nd}=|
/                    d          }8t          |8d                    }?|?d k    r|?d9k    s
J dR            t"          jF        t"          jG        t"          jH        t"          jI        t"          jJ        t"          jK        t"          jL        t"          jC        dS}H|H|?         }It          |8d                   }: |0| |.|I|:|=T                     ndU|
v r|t"          jM        k    s
J dV            ddWl"mN}JmO}K ddXlPmQ}L |
/                    d          }8t          t"          dY|8d                     }Mt          |8d                   }:|:dk    r |K|:          n
 |Jd          }Nt          |8d"         S                                dZk              }O |0| |L|M|N|Ortn          jT        ntn          j8        d[\                     nd]|
v r |0| |&                       nd^|
v r|rd+|v r |0| |$            .           nrt          |
/                    d          d_                   }N|Nd`k    r |2            }Nn|Ndak    r |1            }Nn
 |2            }N |0| |%|Nb                     ndc|
v rVdddl<mU}P ddelVmW}F ddflXmY}Q dJ}R |P|R|Fd|jZ        j[        dg
          \                    dMgd           ]                                d         j^        d         }S |F|S          }St#          j_        dg          5  |`                    d |Rh           ddd           n# 1 swxY w Y   di|
k    r+ |Q|dt          ja        j        jY        jb        |S|Rj          }ndk|
k    r* |Q|dt          ja        j        jY        jc        |S|Rj          }ndl|
k    r* |Q|dt          ja        j        jY        jd        |S|Rj          }ndm|
k    rt          ja        j        jY        je        t          ja        j        jY        jb        z   t          ja        j        jY        jd        z   }Tt          j        f                                r|Tt          ja        j        jY        jc        z  }T |Q|d|T|S|Rj          }n |Q|d|S|Rn          }t          do           tc          |t          |d|          ||d||(           t          dp           |g                                 ndq|
v rdddl<mU}P ddelVmW}F dJ}R |P|R|Fd|jZ        j[        dg
          \                    dMgd           ]                                d         j^        d         }S |F|S          }St#          j_        dg          5  |`                    d |Rh           ddd           n# 1 swxY w Y   dr|
k    r! |/|dt          j        jb        |S|s          }n{dt|
k    r! |/|dt          j        jc        |S|s          }nTdu|
k    r! |/|dt          j        jd        |S|s          }n-dv|
k    r! |/|dt          j        jh        |S|s          }ndw|
k    rqddl(}4ddl)}5ddxlimj}U |U0                    d|5,                    |4-                                          j.         d            |/|dt          j        jk        |S|s          }ndy|
k    rz	 ddl(}4ddl)}5ddxlimj}U |U0                    d|5,                    |4-                                          j.         d           n#  Y nxY w |/|dt          j        jl        |S|s          }n |/|d|S|z          }tc          |t          |d|          ||d||(           |g                                 nd{|
v r<dd|lmmn}V |D                    |            |0| |Vt"          jI        dD}                     n|rdd~lomp}Wmq}X d+|v r) |X|D                    |           |W            .           d|v rddlomrms}Y |/                    d          \  }Zt                    t                    c |X|fd.           t          |            |X|ju        .           t          |            |X| |Y          .           t          |d          dz  }[|rt          |	w                                          }\t          |	jx                  /                    d          d         }]t#          jy        |z                                |4j{        |                    |\|]d|
 dz                        |rIt          d           t#          j}        t          dd          a~|rt#          j}        t          dd          a|rk|dgk    r(t"          j        j                            ddd           n=|dk    r(t"          j        j                            ddd           nt          d           g g g g d}^|rd_nd}_t	          |_|          D ]}`|`dk    rI|dgk    rt"          j                                         n$|dk    rt"          j                                         t          |
           |`dk    rO|rMt          d          }|r(t           d|                                 dt           }t          |d|          }|r0|`dk    r*| (g                     d          d         dfd}an0|+|`dk    r%g                     d          d         fd}and }at          j                    }t          |          t          |          }c}bt          |          t          |          }e}dddl}f|`|d z
  k    s|s|f                                }gnAt"          j        j                                         t"          j                                        }g|g5  tc          ||||||a||||||b|c|d|e          }hddd           n# 1 swxY w Y   |`dk     r*t          dt          j                    |z
  dd           9t'          |gd          r|g                    | d           t          |
           t          j                    |z
  }i|s|| |hd                                         }j                                |jvr|jn.|jd|j                                                                       }kt                              |k                     nt          d           |h                    d_          |!z
  }l|l|iz  }m|^d                             |m           |^d                             |i           |d                    |e          dz  }n|l|nz  }o|^d                             |o           |b                    |c          dz  }p|^d                             |p           t          d|`d z    d|idd|mddd|pdd|odd           t          d|[|mz  dd           |r|`dk    r|dgk    r$t"          j        j                                        }qn9|dk    r$t"          j        j                                        }qnt          d           t          | dd          5 }ddlm}r  |r|q|           ddd           n# 1 swxY w Y   t          d| dd            nt          d           t#          j        t#          j        |^d                                                             }st#          j        t#          j        |^d                                                             }tt#          j        t#          j        |^d                                                             }u|[|sz  }vt"          j                                        dz  }wt          d|sd           t          d|udd           t          d|tdd           |dgk    r"t"          j                                        dz  }wn'|dk    r!t"          j                                        dz  }wt          d|sd           |d k    rt          d||sz  d           t          d|vdd           t          d|wdd           t          d|[dd           |rjdtE          j                                        d           d|sdd|udd|tdd|vdd|wdÛd|[dÛdŝ}x|xd|
 d| d|	j        jx         d| d| d| d| d| dz  }x|xdz  }x|x|
rd|
 dndz  }x|x|rd| dndz  }x|xd|	 dz  }x|xd| dz  }x|xd| dz  }x|x|rdndz  }x|x|rdndz  }x|x| rd|  ndz  }x|x|rd| dndz  }x|x|rd| dndz  }x|x|rdndz  }x|xd| dz  }x|xd| dz  }x|xd| dz  }x|xd| dz  }x|xd| dz  }x|x|rd| ndz  }x|x|rdndz  }x|x|rdndz  }xt          |d          }|                    |x           |                                 |rgg d}y|	j        jx        }ztO                      }{|
pd}I|z|I||||{d|vdg	}||z|I||||{d|sdg	}}|rtP          ntR          }~ |~||y||            |~||y|}           dS dS )zNGenerates text samples based on a pre-trained Transformer model and tokenizer.Nr   zprompt    rztokenizer.modelzUsing device=chatzLoading model ...r/   zTime to load model: z.02fz secondsT)r   r0   z
 <END_TEXT>FrK   i  c                 H    t          | t          j        j                  od|v S )Nfeed_forward
isinstancer,   rT   Linearmodfqns     r   ffn_onlyzmain.<locals>.ffn_onlyI  s     #ux//INc4IIr   c                 \    t          | t          j        j                  o | |           S r   r   )r   r   r   s     r   not_ffn_onlyzmain.<locals>.not_ffn_onlyL  s+    #ux//Jc8J8J4JJr   c                 P    t          | t          j        j                  od|v pd|v S )Nr   rr   r   r   s     r   ffn_or_attn_onlyzmain.<locals>.ffn_or_attn_onlyO  s/    #ux// 
c!7[C%7	
r   )3Float8DynamicActivationFloat8SemiSparseWeightConfig)Float8DynamicActivationFloat8WeightConfigFloat8WeightOnlyConfigFPXWeightOnlyConfigGemliteUIntXWeightOnlyConfig%Int4DynamicActivationInt4WeightConfigInt4WeightOnlyConfig%Int8DynamicActivationInt4WeightConfig%Int8DynamicActivationInt8WeightConfigInt8WeightOnlyConfigUIntXWeightOnlyConfig	autoquant	quantize_)PerRow	PerTensor	spinquant)apply_spinquantgemliterL   z/tmp/z_gemlite.json-r9   None   dqdynamicweight_only)	bit_width
group_sizemodezrunning gemlite warmup)r   rG   rH   int8woint8dqsemi)SemiSparseLayout)layout)	filter_fnint8dq_prefill_wo_decode)weight_only_decodeint4wohqq)    @         z=int4wo group_size needs to be one of [32,64,128,256] but got )r   use_hqqversionzint4dq-)CutlassInt4PackedLayout      )mapping_typeact_mapping_typer   )r   r	  r
  r   marlinqqq)MarlinQQQLayoutr  )MarlinSparseLayout)r   r  fp6zembed-int8wor  )r   c                 @    t          | t          j        j                  S r   )r   r,   rT   	Embedding)ra   argss     r   rl   zmain.<locals>.<lambda>  s    :a9K+L+L r   awq)TransformerEvalWrapper)AWQObservedLinear	awq_uintxinsert_awq_observer_r  )quant_dtyper   )r`   r   r   input_prep_funcr0   wikitext)taskslimitc                 $    t          |           S r   )r   )mr   r  s     r   rl   zmain.<locals>.<lambda>  s    
1>O0P0P r   )r  r   r  uintxznbits must be 1 to 8)r9   r   r   r  r         r  )r  #int8_dynamic_activation_intx_weightzJint8_dynamic_activation_intx_weight requires using precision=torch.float32)PerAxisPerGroup)%Int8DynamicActivationIntxWeightConfigrB   trueopaque_torchao_auto)weight_dtypeweight_granularityweight_mapping_typeintx_packing_formatfloat8wofloat8dqr:   r   row)granularityautoquant_v2)LMEvalInputRecorder)r6   )r0  r%   )r   r   zautoquant_v2-int4)manualqtensor_class_listexample_inputr   zautoquant_v2-float8zautoquant_v2-fpzautoquant_v2-all)r2  r4  r   zrunning generatezrunning finalize autoquantr   zautoquant-int4)r2  r3  r4  r   zautoquant-float8zautoquant-fpzautoquant-sparsezautoquant-gemlite-int4)GemLiteLinearTritonzautoquant-all)r2  r4  r   codebook)codebook_weight_only)r=   scale_block_size)semi_sparse_weight	sparsify_bsr)SupermaskLinearblock_sparse_weightc                 4                         |           S )N)sparsity_level	blocksize)from_linear)ra   r<  r@  r?  s    r   rl   zmain.<locals>.<lambda>6  s%    /55#1' 6   r   )r@  )ignore_embeddingsg    eA.z.ptzCompiling Modelzreduce-overhead)r   	fullgraph)rD  r   i )trace_alloc_max_entriestrace_alloc_record_contextr'   z2Memory profiling only works on CUDA or XPU devices)tokens_per_secr   decode_tokens_per_secprefill_timezWhat is your prompt?  c                    rd S                                          g|                     d                                          z             dd                     |                                                                 k    rdt                    dk    sr;t          d                              dd           	                                 d S d S )Nr   r9   Tr   endflush)
rw   decoder   tolistitemeos_idlenr.   joinclear)ra   bufferdone_generating	period_idr   s    r   rx   zmain.<locals>.callback  s    " Fi..	{QYYq\\=P=P=R=R/RSSTUTVTVWXXX6688y//1111&*Ov;;!###"''&//r>>>>LLNNNNN $#r   c                 V                                            g|                     d                                          z             dd                     t	                    dk    r;t          d                              dd                                            d S d S )Nr   r9   r  rL  TrM  )rw   rP  r   rQ  rT  r.   rU  rV  )ra   rW  rY  r   s    r   rx   zmain.<locals>.callback  s    i..	{QYYq\\=P=P=R=R/RSSTUTVTVWXXXv;;!##"''&//r>>>>LLNNNNN $#r   c                     | S r   r#   r   s    r   rl   zmain.<locals>.<lambda>  s     r   )r   rx   rG   rH   r   r   r   r   r   r   r   zCompilation time: z.2fexport_chrome_tracez.json
rG  r   r   rH  rI  zSample z | overall time z.04fz s z tokens/secz| prefill time z
 s decode zBandwidth achieved: z GB/sz.picklewb)dumpz
memory profile z4.pickle saved, to convert that to a usable file, usez_python pytorch/torch/cuda/_memory_viz.py trace_plot <pickle file> -o <desired output name>.htmlz
==========zAverage overall tokens/sec: zAverage decode tokens/sec: z szAverage TTFT: zAverage tokens/sec: z%Average tokens/sec including batches zAverage Bandwidth: zPeak Memory Usage: z GBzModel Size: z%Y%m%d%H%M%Sz, tok/s=z6.2fz, tok/s_decode=z, ttft=z5.4fz, mem/s=z7.2fz GB/s, peak_mem=z5.2fz GB, model_size=z GB zquant: z
, sparse: z, mod: z, kv_quant: z, compile: z, compile_prefill: z	, dtype: z
, device: zrepro: python generate.py z--quantization rL  z--sparsity z--checkpoint_path z	--device z--precision z
--compile z--compile_prefill z--prefill_size z
--profile z--interactive z--num_samples z--max_new_tokens z--batch_size z--top_k z--temperature z--cache_size z--kv_cache_quantization z--linear_causal_mask a)	namer=   r   r   r0   archmetricactualtargetnoquantzmem/sztok/s)rB   openreadtorchaor   utils"recommended_inductor_config_setteris_fileparentr   r.   r   r   r4   r7   r   rO   r,   r   manual_seedtorchao.quantizationr   r   r   r   r   r   r   r   r   r   r   r   r    torchao.quantization.granularityr   r   torchao.prototype.spinquantr   
startswithospwdr   set_autotunegetpwuidgetuidpw_gecossplitload_configr   cache_configtorchao.dtypesr   r  removeprefixr
   	SYMMETRICtorchao.prototype.dtypesr  r  torchao._models._evalr  torchao.prototype.awqr  r  r  getattruint8rA   run_evaluint1uint2uint3uint4uint5uint6uint7float32r#  r$  torchao.quantization.quant_apir%  boollower
ASYMMETRICr1  torchao._models.llama.modelr6   +torchao.prototype.quantization.autoquant_v2r0  r   
vocab_sizerecord_inputsget_recorded_inputsvaluesr0   r   	prototype!DEFAULT_INT4_AUTOQUANT_CLASS_LISTOTHER_AUTOQUANT_CLASS_LIST"DEFAULT_FLOAT_AUTOQUANT_CLASS_LISTDEFAULT_AUTOQUANT_CLASS_LISTis_sm_89finalize_autoquant#DEFAULT_SPARSE_AUTOQUANT_CLASS_LISTgemlite.corer5  !GEMLITE_INT4_AUTOQUANT_CLASS_LISTALL_AUTOQUANT_CLASS_LIST'torchao.prototype.quantization.codebookr7  torchao.sparsityr9  r:  r<  r=  rS   	to_linearr   cwdra  r   
state_dictpathrU  r   rj   rg   r%   memory_record_memory_historyr'   rq   reset_peak_memory_statsinputB_INSTstripE_INSTr   r   r1   
contextlibnullcontextprofiler_utils_init_for_cuda_graphsr   hasattrr\  rQ  rS  indexrP  rw   r   	_snapshotpickler_  meanr   rR  max_memory_reservedr   todaystrftimewritecloser   r   r	   )r   r   r   r   r   r   r   rH   rG   r   r   r   r   r   r   r   r   r   r   r   r   r0   r   r   r   r   ftokenizer_pathis_chatt0r`   encodedend_tagr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rs  rt  r   config_file_quant_argsr   r   r   r   r  r  nbitsr  r  r  r  r  r  r6   is_observed_linear_NBITS_TO_DTYPEr=   r#  r$  r%  r(  r/  is_asymmetricr1  r0  calibration_seq_lengthinputsall_qtensor_classesr5  r7  r9  r:  r=  rY   
model_size
output_dirfilenameaggregate_metricsstartr{   rx   r   r   r   r   r  profyttok_listr   tokens_generated
tokens_secdecode_timedecode_tokens_secrI  snapshotr_  	tokpersecttftdecode_tokpersec	bandwidthmem
result_txtheadersra  rb  memory_resultperformance_resultwrite_json_resultr  r<  r@  rW  rX  r   rY  r?  r   s                                                                                                                                  @@@@@@@@@r   mainr    s$   @ L1$4$4 (#l"3"3a"78FF+S11 "Q" " " " " " " " " " " " " " " AACCC""$$55o55$$+.??N!!##88S%8%888#	
"&
"
"###O,,,G	
	B;;Ev	
@r!1
@
@
@
@AAAno>>IIv4GGGG(	?fUUU:L7<<??::;)Wg.A666LLOOM	dJ J JK K K K K
 
 

  t	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	GFFFFFFF,&&CCCCCCOE"""""9-- !	.IIIJJJNNN  '''S#,,ryy{{";";"DSSSK&,,S11KKN++I!,Q6!9!9s;q>?R?RJ +A$ 6 699MDI,,'JT     ,,,*+++iT&III!'      ---|##Ie1133444|## JFh..;;;;;;	99AQAQASASTTT&   
 	99;;*    
 ,|;;	99TRRR   
 	%!F!F!H!HIII|##G$$\//44Q788J "    ]PZ\\   I$$
GUVWWW    ,&&>>>>>>11)<<==EA::!+zz	99%0%:)4)>6688      !	99#'%0%:)4)>6688	     |##$$DDDDDD	99#&%0%:)4)>.00	      8##======	((0B0B0D0DaPPP.   
 L  Ie00A667777|++I$$333LL    
 $$U++ z	DDDDDD          ',,S11!4K\//44Q788J!%ekBBKHHV$$E  q#;:    #"hhv&&#" 8   h!l     "Q!P!P!P|+GI	 +
G   #    $$$$&,,S11KA''EA::%1***.D**,;;;;;;;;	 	O $E*E[^,,JIe225*gVVVWWWW2lBB---\ .-- KJJJJJJJ     
 ',,S11K"5*@A*@*@AAL[^,,J2<q..((:...ggajjK Q!5!5!7!76!ABBMI55!-'2$)/(>(>$.(=  
 
 
 
 <''Ie33556666<'' Fh..	GGII&     ","4"4S"9"9""=>>(**"+)++KK E))"(&((KK"+)++K	==+VVV    |++AAAAAALLLLLLPPPPPP%("##*,L+!   L  %$&&q*    .-f55Ff%%  ""#$5K #                 
 #l22$'.'8'E'R't"(5   ',66$'.'8'E'R'm"(5   #l22$'.'8'E'R'u"(5   $|33%2?\'4Acd'4Ade $
 =))++ r'7+<+I+V+qq'$':"(5   %"(5	   $%%%iT&III!'    .///$$&&&&L((AAAAAALLLLLL%("##*,L+!   L  %$&&q*    .-f55Ff%%  ""#$5K #                 
  <//!	'.';']"(%   $|33!	'.';'V"(%    <//!	'.';'^"(%   $|33!	'.';'_"(%   *\99			


<<<<<<#//MCLL55>MMM   "	'.';']"(%   !L00
IIIJJJ@@@@@@'33QRYY[[ 9 9 BQQQ   D!	'.';'T"(%   "	$fx   iT&III!'    $$&&&&<''TTTTTTHHVI++%+PRSSS  
 
 !BBBBBBBBXIehhv&&(:(:(<(<QQQQHMMMMMMMM ,4>>#+>+>(A~y(-n(=(=s9~~%NII     
 #    %LLLI)"   
 %LLL I**Y???8    )$GGG#MJ 
,,..//
+,,22377:
GLLX0EL0E0E0E%EFF	
 	
 	

  
K    ="
 
 
  	KmGtTJJJG 
HVJ44fQU 5     u__I33fQU 4     FGGG !#	   BBqE5+&& } }66
2244445	113336""""66k6233F ?">>V\\^^>>f>>#Iv4OOOG 	#166l&:F!((--a0I#O	# 	# 	# 	# 	# 	# 	# 	# 	# #.166F!((--a0I# # # # # # # # #{H       /
      - 	aw))++DDN!77999>))++D 	 	'!'&;%#5$7"3#5!1  A	 	 	 	 	 	 	 	 	 	 	 	 	 	 	$ q55Mt'8':':R'?MMMMNNN4.// 	8$$%6%6%67776"""""$ 
	4<AUt{{}}H ##%%X55 By/?/?/A/A B BBC 
 )""6**++++$KKK66"::5%)
*+22:>>>&!((+++(556FGG$N,{:1299:KLLL*778IJJTQ.)00>>>Ta!eTTQTTT
TTTT^l^^^;L^^^^	
 	
 	
 	HZ*%<HHHHIII 	a1ff :,66885 9+5577JKKK000$77 "1''''''Xq!!!" " " " " " " " " " " " " " " hNhhhq   E	, 
5<(9:J(KLLMMRRTTI:el#4^#DEEFFKKMMDz&'>?@@ 
dff  Y&I
*
(
(
*
*S
0C	
8
8
8
8999	
A(8
A
A
A
ABBB	
(4
(
(
(
()))j,,..4	5i++--3	
0
0
0
0111A~~Rj96LRRRSSS	
5	
5
5
5
5666	
-
-
-
-
-...	
-
-
-
-
-...  m(.**33NCC  m  mY  m  m  ml|  m  m  m  KO  m  m  m  ^g  m  m  m  ~A  m  m  m  Xb  m  m  m  m
  i  i  i  i  iI_Id  i  i  sH  i  i  U\  i  i  q@  i  i  KT  i  i  `f  i  i  i  	i
22
<O77777RO
8C/H////C
=?====
+&++++
1Y1111
g5ll25
oE**2E
,N6666BN
@-7----b@
N4>4444BN
+=&&2=
5{5555
;.;;;;
3j3333
)))))
5{5555
jH2j222bH
4IQ00rQ
1CK--K
s##	
				 )I

 

 

 %*)	

 

 (9U##>U 	 	*G]CCC*G5GHHHHHS)I )Isr   AA"Ag<<h h -qqq6Aw wJ'AKKAK	KAK	U0AVVAV	VAV	__main__zYour CLI description.)descriptionz--prefill_sizezWhether to run in ttft mode)typedefaulthelpz--promptzInput prompt.z--demo_summarize_promptzRead prompt from text file)r  r  z--interactive
store_truez%Whether to launch in interactive mode)actionr  z--num_sampleszNumber of samples.z--max_new_tokenszMaximum number of new tokens.z--batch_sizezBatch size to benchmark withz--top_kzTop-k for sampling.z--temperaturezTemperature for sampling.z--checkpoint_pathz<../../../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pthzModel checkpoint path.z-qz--quantizationa  Which quantization techniques to apply: int8dq, int8wo, fp6, int4wo-<groupsize>, int4wo-<groupsize>-hqq, autoquant, autoquant-int4, autoquant-gemlite-int4, autoquant-float8, autoquant-sparse, autoquant-all, uintx-<nbits>-<groupsize>, uintx-<nbits>-<groupsize>-hqq, sparse-marlin, spinquant, embed-int8wo, marlin_qqq, gemlite-<pack_bitwidth>-<nbits>-<groupsize>, float8dq, int4dq-<nbits>, fbgemm-int4-<group_size>z
--min_sqnr)zNmin sqnr for quantizing v.s. not quantizing a layer, used in autoquant optionsz-sz
--sparsityz3Which sparsity techniques to apply: semi-structuredz--kv_cache_quantizationz Whether to quantize the KV cachez--cache_sizezeForce size of cache to be a certain number of tokens, if not set, will use max_new_tokens+prompt_sizez--linear_causal_maskztWhether to use the memory efficient, but slightly less fast, linear causal mask (important for long context lengths)z--savez$Whether to save the quantized model.z	--compilezWhether to compile the model.z--compile_prefillzPWhether to compile the prefill (improves prefill perf, but higher compile times)z	--profilezProfile path.z--memory_profilezfilename for memory profile.z--devicezDevice to usez--precisionc                 ^    t          t          |                     d          d                   S )NrC  r:   )r  r,   ry  r   s    r   rl   rl     s    wuaggcll2&677 r   zdtype precision to usez--write_resultzPath where to write the resultz--output_json_pathz1Path where to write the json result for dashboardz--output_json_localznWhether to output json result for local machine or for CI machine, local option will fill in some dummy fields)rF   N)bsysr   r   pathlibr   typingr   r   r,   torch._dynamo.configtorch._inductor.configri  torchao._models.utilsr   r   r	   %torchao.quantization.quant_primitivesr
   torchao.utilsr   sparseSparseSemiStructuredTensor_FORCE_CUTLASSbackendsr%   enable_cudnn_sdpr   r1   r4   is_availabler'   default_device__file__rm  resolvewdr  rw   r   r  r5   r6   torchao._models.llama.tokenizerr7   rE   rS   rB   r\   r_   Tensorrg   rj   r~   r   no_gradr  r-   r   r   r   r  r  bfloat16r  r    argparseArgumentParserparseradd_argument
parse_argsr  r.   r   r   r   r   r   r   r   rH   rG   r   r   r   r   r   r   r   r   r   r   r   r   r0   r   r   r   r   r#   r   r   <module>r	     su   


              " " " " " " " "                   
 > = = = = = 1 1 1 1 1 19> ' 6   $ $T * * *D D D D D D D D8 8 88 8 8 z  FF y
	  
T(^^!))++ B    M M M M M M M M 9 9 9 9 9 9R R R  Xc]      HSM    00 <049L0
\0 0 0 0-- <-49L-
5<%&- - - - [! !!|! |! 	! ! ! !2    ["' $$6:485937Q Q QQLQ Q 	Q Q  Q Q Q "%*"23Q  
 01Q !!12Q uz/0Q  \!Q Q Q Qh *.n @ @ @ @   % #'%+/ DI  #' $""' $$!"%)n#''+#9JI JI3-JIJI $C=JI 	JI
 JI JI JI JI JI JI 3-JI uoJI smJI   !JI" #JI$ %JI& 'JI( )JI* +JI, d^-JI. TN/JI4 4.5JI6 tn7JI8 9JI: 
;JI JI JI JIZ zOOO$X$1HIIIF
sD7T     &9     !2N     4    
 c1CWXXX
c8W     S!2P     	S?TUUU
eS7R     STT%	     J  	 	 	 
	     C	     !/    
 t	      D    
 ,R     L/N     _    
 $?SSS
t:X     n?     77%	     tT8X     @	     }     D	E$KKKD"
"	5    i r   