
    .`i(-                     ,   d dl mZmZ d dlZd dlZd dlmZ d dlm	Z	 d dl
mZ d dlmZmZmZ d dlmZ d dlmZmZ d d	lmZ d d
lmZmZmZ d dlmZ d dlmZmZ d dl m!Z!  ee"          Z# G d de          Z$ G d de          Z% G d de          Z&dS )    )AnyOptionalN)	Parameter)	Attention)init_logger)
LinearBaseLinearMethodBaseUnquantizedLinearMethod)QuantizationMethods)QuantizationConfigQuantizeMethodBase)BaseKVCacheMethod)apply_petit_nvfp4_linearprepare_nvfp4_layer_for_petitverify_petit_nvfp4_supported)is_layer_skipped)ModelWeightParameterPerTensorScaleParameter)current_platformc                      e Zd ZdZ	 	 	 	 ddededz  dedz  dee         dz  ddf
d	Zdd
Z	e
defd            Ze
deej                 fd            Ze
defd            Ze
dee         fd            Ze
deeef         dd fd            Ze
dedz  fd            Ze
deeef         defd            Zdedee         defdZdej        j        deded         fdZdee         fdZdefdZdefdZdee         fdZdS )PetitNvFp4ConfigzConfig class for Petit FP4.FNis_checkpoint_nvfp4_serializedkv_cache_quant_algo
group_sizeexclude_modulesreturnc                     |                                   || _        |rt                              d           || _        || _        || _        d S )Nz]Detected nvfp4 checkpoint. Please note that the format is experimental and subject to change.)_check_hardware_supportr   loggerwarningr   r   r   )selfr   r   r   r   s        /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/petit.py__init__zPetitNvFp4Config.__init__*   sa     	$$&&&.L+) 	NN@   %#6 .    c                 J    t          j                    rt          d          dS )z
        Verifies that the current hardware is supported by the Petit backend.
        This backend is specifically designed for AMD GPUs and is not
        supported on the CUDA platform.
        zThe 'petit' quantization backend is designed for AMD GPUs and is not supported on the CUDA platform. For NVIDIA GPUs, please use a different quantization method such as FP8, AWQ, or GPTQ.N)r   is_cuda
ValueErrorr!   s    r"   r   z(PetitNvFp4Config._check_hardware_support<   s6     #%% 	  	 	r$   c                     dS )Npetit_nvfp4 clss    r"   get_namezPetitNvFp4Config.get_nameK   s    }r$   c                 2    t           j        t           j        gS N)torchbfloat16halfr,   s    r"   get_supported_act_dtypesz)PetitNvFp4Config.get_supported_act_dtypesO   s    
++r$   c                     dS )NZ   r+   r,   s    r"   get_min_capabilityz#PetitNvFp4Config.get_min_capabilityS   s	     rr$   c                     dgS )Nzhf_quant_config.jsonr+   r,   s    r"   get_config_filenamesz%PetitNvFp4Config.get_config_filenamesX   s    &''r$   configc                    |                      |dg          }|                    d          }t          |t                    r|st	          d          |                                }|                    d          }t          |t                    st	          d          |}t          ||           |                    d          pd}t          |t                    st	          d          |}|                    d	g           }	|	g }
n@t          |	t                    rt          d
 |	D                       r|	}
nt	          d          d|v } | ||||
          S )Nquantization
quant_algoz7Missing or invalid 'quant_algo' in quantization config.r   z>Missing or invalid 'group_size' (int) in hf_quant_config.json.r   autoz3'kv_cache_quant_algo' must be a string if provided.r   c              3   @   K   | ]}t          |t                    V  d S r0   )
isinstancestr).0xs     r"   	<genexpr>z/PetitNvFp4Config.from_config.<locals>.<genexpr>v   s=       3
 3
#$Jq#3
 3
 3
 3
 3
 3
r$   z3'exclude_modules' must be a list[str] (or omitted).NVFP4)r   r   r   r   )
get_from_keysgetr@   rA   r'   upperintr   listall)r-   r:   qcquant_method_rawquant_methodgroup_size_rawr   kv_cache_quant_algo_rawr   exclude_rawr   r   s               r"   from_configzPetitNvFp4Config.from_config\   s   v'78866,//*C00 	X8H 	XVWWW'--//--.#.. 	P   $
$\:>>>"$&&)>"?"?"I61377 	TRSSS5ff.33)+OOT** 	Ts 3
 3
(33
 3
 3
 0
 0
 	T *OORSSS)0L)@&s+I 3!+	
 
 
 	
r$   c                    t          j                    sd S |                    d|          }|                    d          p|                    d          pd                                }|dv r|                                 S d S )Nr<   r=   rN    )rE   MODELOPT_FP4MODELOPT)r   is_rocmrG   rH   r.   )r-   hf_quant_cfg
user_quantrL   algos        r"   override_quantization_methodz-PetitNvFp4Config.override_quantization_method   s      ')) 	4nl;;|$$D~(>(>D"KKMM888<<>>!tr$   quant_configc                     |                     d|          }|                     d          p|                     d          pd                                }|dk    S )Nr<   r=   rN   rT   rE   )rG   rH   )r-   r\   rL   rZ   s       r"   is_petit_nvfp4_compatiblez*PetitNvFp4Config.is_petit_nvfp4_compatible   sT    nl;;|$$D~(>(>D"KKMMwr$   prefixc                     |D ]D}|                     dd                               dd          }t          j        ||          r dS EdS )N.z\.*z.*TF)replacere	fullmatch)r!   r_   r   pattern	regex_strs        r"   is_layer_excludedz"PetitNvFp4Config.is_layer_excluded   sY    & 	 	GU33;;CGGI|Iv.. ttur$   layerr   c                 &   |                                  }t          |t                    rCt          ||          s|                     ||          rt                      S t          |           S t          |t                    rt          |           S d S r0   )	require_exclude_modulesr@   r   r   rh   r
   PetitNvFp4LinearMethodr   PetitFp8KVCacheMethod)r!   ri   r_   excludes       r"   get_quant_methodz!PetitNvFp4Config.get_quant_method   s     ..00eZ(( 	/00 1D4J4J5 5 1 /000)$///y)) 	/(...tr$   c                     g S r0   r+   r(   s    r"   get_scaled_act_namesz%PetitNvFp4Config.get_scaled_act_names   s    	r$   c                 V    | j         t                              d           dS | j         S )Nz/group_size not set; defaulting to 16 for NVFP4.   )r   r   r    r(   s    r"   require_group_sizez#PetitNvFp4Config.require_group_size   s*    ?"NNLMMM2r$   c                     | j         pdS )Nr>   )r   r(   s    r"   require_kv_cache_quant_algoz,PetitNvFp4Config.require_kv_cache_quant_algo   s    '161r$   c                 .    t          | j        pg           S r0   )rJ   r   r(   s    r"   rk   z(PetitNvFp4Config.require_exclude_modules   s    D(.B///r$   )FNNN)r   N) __name__
__module____qualname____doc__boolrA   rI   rJ   r#   r   classmethodr   r.   r1   dtyper4   r7   r9   dictr   rR   r[   r^   rh   nnModuler   ro   rq   rt   rv   rk   r+   r$   r"   r   r   '   s       %% 05*.!%,0/ /(,/ !4Z/ $J	/
 cT)/ 
/ / / /$    ,    [ ,ek): , , , [, 3    [ (T#Y ( ( ( [( '
c3h '
4F '
 '
 '
 ['
R 
	t	#
 
 
 [
 T#s(^     [
 d3i D    X_.1	&	'   d3i    C    2S 2 2 2 20c 0 0 0 0 0 0r$   r   c                   (     e Zd ZdZdef fdZ xZS )rm   zI
    Supports loading kv-cache scaling factors from FP8 checkpoints.
    r\   c                 J    t                                          |           d S r0   )superr#   )r!   r\   	__class__s     r"   r#   zPetitFp8KVCacheMethod.__init__   s!    &&&&&r$   )rx   ry   rz   r{   r   r#   __classcell__)r   s   @r"   rm   rm      sO         '%5 ' ' ' ' ' ' ' ' ' 'r$   rm   c                       e Zd ZdZdefdZdej        j        de	de
e	         de	de	d	ej        fd
Zdej        j        ddfdZ	 ddej        j        dej        dej        dz  dej        fdZdS )rl   a8  Linear method for NVFP4.
    Supports loading NVFP4 checkpoints with the following structure:

    |Tensor Name           | datatype      |  shape      |
    |----------------------------------------------------|
    |input_scale           | torch.float32 | scalar      |
    |weight                | NVFP4(SE2M1)  | [1, X, y/2] |
    |weight_scale          | FP8-E4M3      | [X, Y]      |
    |weight_scale_2        | torch.float32 | scalar      |

    The weights are quantized per block of 16 elements.
    Args: quant_config: The ModelOpt quantization config.
    r\   c                     || _         d S r0   )r\   )r!   r\   s     r"   r#   zPetitNvFp4LinearMethod.__init__   s    (r$   ri   input_size_per_partitionoutput_partition_sizes
input_sizeoutput_sizeparams_dtypec                    ~~| j         j        st          d          t          |          }|                    d          }	||_        ||_        ||_        |dz  dk    rt          d          | j         j        rt          j	        n|}
t          t          j        ||dz  t          j                  dd|		          }|                    d
|           t          t          j        t          |          t          j                  |	          }|                    d|           t          t          j        t          |          t          j                  |	          }|                    d|           | j                                         }t          t          j        |||z  |
          dd|		          }|                    d|           d S )NzHNVFP4 quantization was selected,  dynamic quantization is not supported.weight_loaderrs   r   z=Unsupported model when in features size is not multiple of 16   )r~      )data	input_dim
output_dimr   weight)r   r   input_scaleweight_scale_2weight_scale)r\   r   r'   sumrG   logical_widthsr   output_size_per_partitionr1   float8_e4m3fnr   emptyuint8register_parameterr   lenfloat32rt   )r!   ri   r   r   r   r   r   extra_weight_attrsr   r   weight_dtyper   r   r   r   r   s                   r"   create_weightsz%PetitNvFp4LinearMethod.create_weights   s     ? 	:  
 %((>$?$?!*..??5)A&*C'#b(A--O    ?E 	 &)(A-k	   '

 

 

 	  6222-S!788NNN'
 
 

 	  <<<0S!788NNN'
 
 
 	  !1>BBB&99;;
+)(J6"  
 '	
 	
 	
 	  >>>>>r$   r   Nc                    |j                                                             t          j                  }|j                                                            t          j                  }t          |d          |_         t          |d          |_        t          |j         |j        z  d          |_        t          |           |` d S )NF)requires_grad)	r   maxtor1   r   r   r   alphar   )r!   ri   input_scale_2r   s       r"   process_weights_after_loadingz4PetitNvFp4LinearMethod.process_weights_after_loading%  s    )--//225=AA-113366u}EE%m5III(uMMM 44E
 
 
 	&e,,,r$   rC   biasc           	      `    t          ||j        |j        |j        |j        |j        |          S )N)inputr   r   r   size_nsize_kr   )r   r   r   r   r   r   )r!   ri   rC   r   s       r"   applyzPetitNvFp4LinearMethod.apply1  s>     (<+ /21
 
 
 	
r$   r0   )rx   ry   rz   r{   r   r#   r1   r   r   rI   rJ   r~   r   r   Tensorr   r+   r$   r"   rl   rl      s        )%5 ) ) ) )I?xI? #&I? !%S		I?
 I? I? kI? I? I? I?V
58? 
t 
 
 
 
  %)	
 
x
 <
 lT!	

 

 
 
 
 
 
r$   rl   )'typingr   r   regexrd   r1   torch.nn.parameterr   vllm.attention.layerr   vllm.loggerr   !vllm.model_executor.layers.linearr   r	   r
   'vllm.model_executor.layers.quantizationr   3vllm.model_executor.layers.quantization.base_configr   r   0vllm.model_executor.layers.quantization.kv_cacher   9vllm.model_executor.layers.quantization.utils.petit_utilsr   r   r   9vllm.model_executor.layers.quantization.utils.quant_utilsr   vllm.model_executor.parameterr   r   vllm.platformsr   rx   r   r   rm   rl   r+   r$   r"   <module>r      s  
 !                    ( ( ( ( ( ( * * * * * * # # # # # #         
 H G G G G G        O N N N N N         
 W V V V V V W W W W W W W W + + + + + + 
X		
U0 U0 U0 U0 U0) U0 U0 U0p' ' ' ' '- ' ' 'w
 w
 w
 w
 w
- w
 w
 w
 w
 w
r$   