
     `iK                         d dl mZmZ ddlmZ erddlmZ ddlmZm	Z	m
Z
mZmZ ddlmZ  e
            rd dlZ ej        e          ZdZ G d	 d
e          ZdS )    )TYPE_CHECKINGOptional   )HfQuantizer   )PreTrainedModel)is_accelerate_availableis_kernels_availableis_torch_availableis_triton_availablelogging)get_module_from_nameNc                   J    e Zd ZdZdZdZdgZ fdZd Zd Z	d&dZ
ddded
efdZdddddeddfdZd'dZdddee         dee         fdZ	 d(dddeee                  fdZdee         ded
ee         fdZd Zd  Zded
efd!Zd)d"efd#Zd(d$Zed
efd%            Z xZS )*Mxfp4HfQuantizerz/
    FP4 quantization using fbgemm kernels
    TF
acceleratec                 X     t                      j        |fi | || _        d | _        d S N)super__init__quantization_configtriton_kernels_hub)selfr   kwargs	__class__s      {/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/quantizers/quantizer_mxfp4.pyr   zMxfp4HfQuantizer.__init__1   s9    ,77777#6 "&    c                     | j         5	 ddlm}  |d          | _         n# t          $ r t          d          w xY w| j         S )z3Lazy import and initialize kernels only when neededNr   )
get_kernelz kernels-community/triton_kernelsz2kernels package is required for MXFP4 quantization)r   kernelsr   ImportError)r   r   s     r   _lazy_import_kernelsz%Mxfp4HfQuantizer._lazy_import_kernels6   sp    "*X......*4*5W*X*X'' X X X!"VWWWX&&s     :c                    t                      st          d          | j        j        rd S t          j                                        s\t          j                                        s>| j        r(t          
                    d           d| j        _        d S t          d          t                      st          d          t          j                                        r d}t          d          ot                      }nAt          j                                        }|dk    }t          d          ot                      }| j        rU|s(t          
                    d	           d| j        _        d S |s(t          
                    d
           d| j        _        d S n"|st!          d          |st!          d          | j        s|                                  |                    d          }|t          
                    d           d S |W| j        sRt'          |t(                    r?d|                                v sd|                                v rt!          d          d S d S d S d S )NzqUsing mxfp4 quantization requires torchPlease install the latest version of torch ( pip install --upgrade torch )z^Using MXFP4 quantized models requires a GPU, we will default to dequantizing the model to bf16Tz-Quantizing a model using MXFP4 requires a GPUz9Using mxfp4 requires Accelerate: `pip install accelerate`z3.5.0)      z3.4.0u   MXFP4 quantization is only supported on GPUs with compute capability >= 7.5 (e.g T4, A100, L4, H100, or B200) or XPUs (e.g Intel® Data Center GPU Max Series) We will default to dequantizing the model to bf16.zMXFP4 quantization requires Triton and kernels installed: CUDA requires Triton >= 3.4.0, XPU requires Triton >= 3.5.0, we will default to dequantizing the model to bf16u   MXFP4 quantization is only supported on GPUs with compute capability >= 7.5 (e.g T4, A100, L4, H100, or B200) or XPUs (e.g Intel® Data Center GPU Max Series) zuMXFP4 quantization requires Triton and kernels installed: CUDA requires Triton >= 3.4.0, XPU requires Triton >= 3.5.0
device_mapzYou have loaded an FP4 model on CPU and have a CUDA/XPU device available, make sure to set your model on a GPU/XPU device in order to run your model. To remove this warning, pass device_map = 'cuda' or device_map = 'xpu'. cpudiskzYou are attempting to load an FP4 model with a device_map that contains a CPU or disk device.This is not supported when the model is quantized on the fly. Please use a quantized checkpoint or remove the CPU or disk device from the device_map.)r   r    r   
dequantizetorchcudais_availablexpupre_quantizedloggerwarning_onceRuntimeErrorr	   r   r
   get_device_capability
ValueErrorr!   get
isinstancedictvalues)r   argsr   gpu_is_supportedkernels_availablecompute_capabilityr%   s          r   validate_environmentz%Mxfp4HfQuantizer.validate_environmentA   s   !## 	]  
 #. 	F
'')) 	TUY-C-C-E-E 	T! T##t   7;(3"#RSSS&(( 	[YZZZ9!!## 	X# 3G < < WAUAWAW!&!A!A!C!C1V; 3G < < WAUAWAW 	# ##I   7;(3$ ##    7;(3 " 		 r   # 	 H   ! 	(%%'''ZZ--
V     #&	z400	 j//1111Vz?P?P?R?R5R5R n   $#	 	 	 	 6S5Rr   dtypetorch.dtypereturnc                 X    |'t           j        }t                              d|           |S )NzOverriding dtype=%s with `dtype=torch.bfloat16` due to requirements of `fbgemm-gpu` to enable model loading in fp4. Pass your own dtype to specify the dtype of the remaining non-linear layers or pass dtype=torch.bfloat16 to remove this warning.)r)   bfloat16r.   info)r   r<   s     r   update_dtypezMxfp4HfQuantizer.update_dtype   s6    =NEKK@    r   modelr   
param_namec                 (   ddl m} ddlm} | j        j        r2d|v sd|v r*t          ||d t          d                              \  }}nt          ||          \  }}t          ||          st          ||          r| j        j        r|dv rdS d	S dS )
Nr   Mxfp4GptOssExpertsGptOssExpertsblocksscales_blocks)down_proj_biasgate_up_proj_biasFT)	integrationsrG   models.gpt_oss.modeling_gpt_ossrI   r   r(   r   lenr4   )r   rC   rD   r   rG   rI   moduletensor_names           r   param_needs_quantizationz)Mxfp4HfQuantizer.param_needs_quantization   s    555555CCCCCC #. 	JH
4J4JhZdNdNd"6ujIZCPYNN?IZ>["\"\FKK"6uj"I"IFKf011 	v}--	262J2U	 EEEu4ur   param_valueztorch.Tensortarget_deviceztorch.devicec                    ddl m}m}m}m}	m}
 ddlm} | j        s| 	                                }t          ||          \  }}t          j        |          5  t          ||          r |	||          \  }}|j        j        |j        j        |j        j        }}} |
|||          \  }}d|v rdnd}t%          |||           t%          || d || | |                                           t'          || d	           t'          || d
           d d d            d S # 1 swxY w Y   d S |                    d          }|                    d          }|                    d          }|                    d          }|                    d          }d|v sd|v r6| j        j        r*t          ||d t-          d	                              \  }}nt          ||          \  }}||||||d}t          ||          st          ||          r`| j        j        rV| j        j        r'|d t-          d	                    } ||||||fi | d S  |||||| 	                                fi | d S d S d S )Nr   )rG   r(   load_and_swizzle_mxfp4quantize_to_mxfp4swizzle_mxfp4rH   gate_up_proj	down_proj_precision_config)rhs_data)weight_scaleflex_ctxrL   _scalesempty_paramcasting_dtypeto_contiguousrankdevice_meshrJ   rK   )rb   rc   rd   re   rf   rC   )rO   rG   r(   rX   rY   rZ   rP   rI   r-   r!   r   r)   devicer4   
matmul_ogsPrecisionConfigFlexCtx
InFlexDatasetattrdelattrr3   r   rQ   )r   rC   rU   rD   rV   r   rG   r(   rX   rY   rZ   rI   r   rR   _triton_weight_tensorr_   ri   rj   rk   projrb   rc   rd   re   rf   shard_kwargsdq_param_names                               r   create_quantized_paramz'Mxfp4HfQuantizer.create_quantized_param   s   	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	DCCCCC! A	!%!:!:!<!<,UJ??IFAm,, 6 6f&899 69J9J;Xj9k9k6(,*5E*5=*5@ /9WO
 :G,l<N: :6(, .<z-I-I>>{DFD*>???222'\GG]g]g]i]iLjLjLjkkk   Ft$4$4$4555Ft$4$4$4555+6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 64 !**]33K"JJ77M"JJ77M::f%%D **]33KJ&&(j*@*@dF^Fi*@0
CTc)nn_CT8UVV	0
CC	  +!.!.* L &"455 6=116:6N6Y +6  %//@#i../@$AMJvz;}mm`lmmmmm**"#%1133  '       s   CD))D-0D-c                 @   | j         j        r|                     |           t          j                                        r t          j                                         d S t          j                                        r t          j                                         d S d S r   )r   r(   remove_quantization_configr)   r*   r+   empty_cacher,   )r   rC   r   s      r   #_process_model_after_weight_loadingz4Mxfp4HfQuantizer._process_model_after_weight_loading  s    #. 	3++E222:""$$ 	$J""$$$$$Y##%% 	$I!!#####	$ 	$r   expected_keyscheckpoint_keysc                 @   g }|D ]}|                     d          rI|d t          d                    }|                    |dz              |                    |dz              a|                     d          rI|d t          d                    }|                    |dz              |                    |dz              | j        s|                     d	          r2|d t          d                    }|                    |dz              |                     d
          r2|d t          d                    }|                    |dz              T|                     d          rk|                    |           |                    |           |S )Nz.mlp.experts.gate_up_projr[   gate_up_proj_blocksgate_up_proj_scalesz.mlp.experts.down_projr\   down_proj_blocksdown_proj_scalesz.mlp.experts.down_proj_blocksz .mlp.experts.gate_up_proj_blocksrK   )endswithrQ   appendr-   )r   rC   rx   ry   new_expected_keyskeybases          r   update_expected_keysz%Mxfp4HfQuantizer.update_expected_keys  s     	. 	.C||788 .1c.11112!((0E)EFFF!((0E)EFFFF677 ..c+..../!((0B)BCCC!((0B)BCCCC' .<< ?@@ 
29#&8"9"9!99:D%,,TK-?@@@@\\"DEE 2<#&;"<"<!<<=D%,,TN-BCCCC\\(++ 2%,,S1111!((----  r   Nkeep_in_fp32_modulesc                 <   ddl m} |                     || j        j        |          | _        |                    dd          }|r&t                              d           d| j        _        |j	        } ||| j        | j        |          }| j        |j	        _        d S )Nr   )replace_with_mxfp4_linearuse_kernelsFzYou are using full precision kernels, we will dequantize the model to bf16. To use the quantized model with quantization kernels, please set use_kernels=FalseT)modules_to_not_convertr   config)
rO   r   get_modules_to_not_convertr   r   r3   r.   r/   r(   r   )r   rC   r   r   r   r   r   s          r   $_process_model_before_weight_loadingz5Mxfp4HfQuantizer._process_model_before_weight_loading(  s     	=<<<<<&*&E&E4+BDX'
 '
# jj66 	7e   37D$/))#'#> $ 8	
 
 
 ,0+C(((r   missing_keysprefixc                 &   ddl m} g |                                D ]f\  }}t          ||          rQ|D ]N}||v s	|| d| v r?|                    d          s*|                    d          s                    |           Ogfd|D             S )Nr   rF   .z.weightz.biasc                     g | ]}|v|	S  r   ).0knot_missing_keyss     r   
<listcomp>z8Mxfp4HfQuantizer.update_missing_keys.<locals>.<listcomp>T  s$    EEEa14D+D+D+D+D+Dr   )rO   rG   named_modulesr4   r   r   )	r   rC   r   r   rG   namerR   missingr   s	           @r   update_missing_keysz$Mxfp4HfQuantizer.update_missing_keysG  s    555555!//11 	9 	9LD&&"455 9+ 9 9GDv4I4I4I4I,I,I ' 0 0 ; ; -J ' 0 0 9 9 -J )//888EEEE<EEEEr   c                     d|j         j        v r0t          |dd           |j                            ddddd           |S )NGptOssConfigbase_model_tp_plangrouped_gemmz(layers.*.mlp.experts.gate_up_proj_blocksz(layers.*.mlp.experts.gate_up_proj_scalesz%layers.*.mlp.experts.down_proj_blocksz%layers.*.mlp.experts.down_proj_scales)r   __name__getattrr   updater   r   s     r   update_tp_planzMxfp4HfQuantizer.update_tp_planV  ^    V-666v3T::F)00DRDRAOAO	    r   c                     d|j         j        v r0t          |dd           |j                            ddddd           |S )Nr   base_model_ep_planr   r   )r   r   r   r   r   r   s     r   update_ep_planzMxfp4HfQuantizer.update_ep_planc  r   r   c                 B   | j         j        r5d|v r|                    dd          S d|v r|                    dd          S n]| j        sV|                    d          r|                    dd          S |                    d          r|                    dd          S |S )NrL    ra   r[   r{   r\   r}   )r   r(   replacer-   r   )r   rD   s     r   get_param_namezMxfp4HfQuantizer.get_param_namep  s    #. 		KJ&&!)))R888j((!)))R888 )# 	K"">22 Q!)).:OPPP"";// K!))+7IJJJr   safe_serializationc                    ddl m} |                                }|                                D ]\  }}t	          ||          rt          |d          rt          |d          rt|j        j        j        	                    |j        j        j
                                      dd                              dddd	          || d
<   |j        j        j        j        	                    |j        j        j        j
                                      dd          || d<   |j        j        j        	                    |j        j        j
                                      dd                              dddd          || d<   |j        j        j        j        	                    |j        j        j        j
                                      dd          || d<   i }||fS )Nr   rF   r[   r\       Z      z.gate_up_proj_blocksz.gate_up_proj_scalesi@  z.down_proj_blocksz.down_proj_scales)rO   rG   
state_dictr   r4   hasattrr[   storagelayoutunswizzle_datadata	transposereshapegate_up_proj_precision_configr_   r\   down_proj_precision_config)r   rC   r   rG   r   r   rR   metadatas           r   get_state_dict_and_metadataz,Mxfp4HfQuantizer.get_state_dict_and_metadata}  s   555555%%''
!//11 	 	LD&6#566FN33 FK00 '/6EEfFYFaFfggYr2&&WRR,, d8889 8EMTcc<IQV iB'' d8889 $,3BB6CSC[C`aaYr2&&WRr2.. d5556 5BJQ``9FNS iB'' d5556 8##r   c                     dS )NTr   )r   r   s     r   is_serializablez Mxfp4HfQuantizer.is_serializable  s    tr   c                 :    t                               d           dS )NzMXFP4 quantization don't support training, please consider dequantizing the model first by passing quantization_config=Mxfp4Config(dequantize=True) to .from_pretrained()F)r.   r/   )r   s    r   is_trainablezMxfp4HfQuantizer.is_trainable  s'     x	
 	
 	
 ur   )r<   r=   r>   r=   )rC   r   r   )F)r   
__module____qualname____doc__ requires_parameters_quantizationrequires_calibrationrequired_packagesr   r!   r;   rB   strboolrT   rs   rw   listr   r   r   r   r   r   r   r   r   propertyr   __classcell__)r   s   @r   r   r   '   s^         (,$ %' ' ' ' '
	' 	' 	'M M M^
 
 
 
.? S _c    "R R $R 	R
 &R R R Rh$ $ $ $!*; !DQTI !hlmphq ! ! ! !@ 59D D D 'tCy1D D D D>FtCy F# FRVWZR[ F F F F         !$ !$T !$ !$ !$ !$F    d    X    r   r   )typingr   r   r   r   modeling_utilsr   utilsr	   r
   r   r   r   quantizers_utilsr   r)   
get_loggerr   r.   r   r   r   r   r   <module>r      s   + * * * * * * *        1000000              3 2 2 2 2 2  LLL		H	%	% A A A A A{ A A A A Ar   