
    PiW                     *   d dl Z d dlZd dlZd dlmZmZmZ d dlZd dlm	Z	 d dlm
Z
 d dlmZmZmZmZ d dlmZmZ d dlmZ  ej        d          Z G d	 d
          Zej        de	ddfd            Zedk    r ej         e                       dS dS )    N)AnyDictList)
DictConfig)nn)config
generationtrainingutils)MessageRole)FullModelTorchTuneCheckpointerDEBUGc                       e Zd ZdZdeddfdZdeddfdZdedeee	f         de
j        fd	Zd
eeef         dee         fdZ ej                    defd            ZdS )InferenceRecipea,  
    Recipe for generating tokens from a dense Transformer-based LLM.

    Currently this recipe supports single-GPU generation only. Speculative
    decoding is not supported.

    For more details on how to use this recipe for generation, please see our
    tutorial: https://pytorch.org/torchtune/main/tutorials/e2e_flow.html#generation

    For using this recipe with a quantized model, please the following section of
    the above tutorial:
    https://pytorch.org/torchtune/main/tutorials/e2e_flow.html#speeding-up-generation-using-quantization
    cfgreturnNc                 d   t          j        |j                  | _        t	          j        |j        | j                  | _        t          j	        |j
                  | _        t	          j        | j                  | _        t	          j        |j        |                    dd                      d S )N)devicedtyper   cudnn_deterministic_mode)seed
debug_mode)r   
get_devicer   _devicer
   	get_dtyper   _dtyper   instantiate	quantizer
_quantizerget_quantizer_mode_quantization_modeset_seedr   get)selfr   s     d/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/recipes/generate.py__init__zInferenceRecipe.__init__%   s    'sz:::(syNNN ,S];;"*"=do"N"Ncgg.H$&O&O	
 	
 	
 	
 	
 	
    c                    t          j        |j                  }| j        <t	          |t
                    st          d          d| j        v rt          d          | j        |                                }n|                    d          }|                     |j	        |t          j                           | _        t          j        |j                  | _        d S )NzQuantization is only supported for models quantized and saved with the FullModelTorchTuneCheckpointer - please ensure you have quantized your model and are using the quantized weights!qata  You have specified a quantizer with 'QAT' - QAT quantizers should only be used during quantization aware training and when quantizing models. Please use the corresponding post-training quantizer e.g. Int8DynActInt4WeightQuantizer for Int8DynActInt4WeightQATQuantizer.F)weights_only)	model_cfgmodel_state_dict)r   r   checkpointerr#   
isinstancer   
ValueErrorload_checkpoint_setup_modelmodelr
   	MODEL_KEY_model	tokenizer
_tokenizer)r&   r   r/   	ckpt_dicts       r'   setupzInferenceRecipe.setup/   s    )#*:;;".l,JKK  A  
 /// i   "*$4466II %44%4HHI''i&x'9: ( 
 
 !,S];;r)   r-   r.   c                    t          j        | j                  5  | j        5  t	          j        |          }d d d            n# 1 swxY w Y   d d d            n# 1 swxY w Y   | j        | j                            |          }|	                    | j        | j                  }|
                                D ]"\  }}|	                    | j                  ||<   #|                    |d           n|                    |           t          j        |                                | j                   t                              d| j         d           |S )N)r   r   T)assign)r   z$Model is initialized with precision .)r
   set_default_dtyper   r   r   r   r#   r!   quantizetoitemsload_state_dictvalidate_expected_param_dtypenamed_parametersloggerinfo)r&   r-   r.   r4   kvs         r'   r3   zInferenceRecipe._setup_modelM   s   
 '44 	2 	2dl 	2 	2&y11E	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 ".O,,U33EHHDLHDDE(..00 9 91&'dd4<&8&8 ##!!"24!@@@@!!"2333 	.""$$DK	
 	
 	
 	
 	I4;IIIJJJs3   AAAA	A
A	AA!Apromptc                 &   g }d|v r2|d         *|                     t          d|d                              |                    t          d|d                   t          dd          g           |                     d|id	          d
         S )z
        Convert the prompt string to a user message with optional system messages
        and tokenize using the prompt template defined on the tokenizer.
        systemN)rolecontentuser	assistant messagesT)	inferencetokens)appendr   extendr8   )r&   rI   rQ   s      r'   convert_prompt_to_tokensz(InferenceRecipe.convert_prompt_to_tokensf   s     v&"2">OOG6(;KLLLMMMVVF^<<<["555	
 	
 	
 
H5FFxPPr)   c           
         |                      |j                  }t          j        |t          j        | j                  }d }|j        r[| j        5  | j                            d| j	        |
                                |j        z              d d d            n# 1 swxY w Y   | j        t                              d           t          j        t           j        dd          }t%          j                    }t!          j        | j        |d|j        |j        | j        j        |	          }t%          j                    |z
  }t                              d
|dd           | j                                         t%          j                    }t!          j        | j        ||j        | j        j        |j        |j        | j        j        |          \  }}|                                }t%          j                    |z
  }t                              | j                            |d                              t;          d t=          j        | j                                         | j        !                                          D                       }	tE          |d                   |#                    d          z
  }
|
|z  }t                              d|dd|dd           t                              d|	|z  dz  dd           | j        j$        dk    rItK          j&                    }t                              d|'                                dz  dd           d S d S )Nr      )
batch_sizer   decoder_max_seq_lenz:Starting compilation to improve generation performance ...zmax-autotuneT)mode	fullgraph   )r4   rI   max_generated_tokenstemperaturetop_kstop_tokenscustom_generate_next_tokenz&Warmup run for quantized model takes: z.02fz sec)r4   rI   r^   pad_idr_   r`   ra   rb   r   c                 P    g | ]#}|                                 |j        j        z  $S  )numelr   itemsize).0ps     r'   
<listcomp>z,InferenceRecipe.generate.<locals>.<listcomp>   s:        		AG,,  r)   zTime for inference: z sec total, z tokens/seczBandwidth achieved: g    eAz GB/scpuzMemory used: z GB)(rV   rI   torchtensorintr   enable_kv_cacher6   setup_cachesr   rf   max_new_tokensr#   rE   rF   compiler	   generate_next_tokentimeperf_countergenerater_   r`   r8   ra   reset_cachesrc   tolistdecodesum	itertoolschain
parametersbufferslensizetyper   get_torch_device_namespacemax_memory_allocated)r&   r   rS   rI   rb   t0_tgenerated_tokens
model_sizetokens_generated
tokens_sectorch_devices                r'   rv   zInferenceRecipe.generatez   s   ..J
 
 fEIdlKKK%)"  	  (( +(.9K(K )                  ".KKTUUU)..^t* * *& "$$B#k%&Oi O7+E  A !##b(AKKMMMMMNNNK$$&&&  (1+!$!3?))3'A	
 	
 	
! ,2244"$DO**+;A+>??@@@ "K**,,dk.A.A.C.C   
 

 /233fkk!nnD%)
S1SSSzSSSS	
 	
 	
 	T:
+BS+HTTTTUUU<%% ;==LKKS A A C Cc ISSSS     &%s   =BB!B)__name__
__module____qualname____doc__r   r(   r:   r   strr   r   Moduler3   r   r   rn   rV   rl   inference_moderv   re   r)   r'   r   r      s        
J 
4 
 
 
 
< < < < < << sCx. 
	   2QT3YQ 
cQ Q Q Q( UIJ I I I I I Ir)   r   r   r   c                     t          j        d|            t          |           }|                    |            |                    |            d S )Nr   )recipe_namer   )r   )r   
log_configr   r:   rv   )r   recipes     r'   mainr      sW    
"3====%%%F
LLSL
OOOr)   __main__)r{   sysrt   typingr   r   r   rl   	omegaconfr   r   	torchtuner   r	   r
   r   torchtune.datar   r   torchtune.trainingr   
get_loggerrE   r   parser   r   exitre   r)   r'   <module>r      sd       



  " " " " " " " " " "                    9 9 9 9 9 9 9 9 9 9 9 9 ( ( ( ( ( ( ( ( = = = = = =		'	"	"n n n n n n n nb j T     zCHTTVV r)   