
     `i	%                         d dl mZmZ ddlmZmZmZmZ ddlm	Z	 ddl
mZ  e            rd dlZerddlmZ  ej        e          Z G d	 d
e	          ZdS )    )TYPE_CHECKINGOptional   )is_accelerate_availableis_torch_availableis_torch_xpu_availablelogging   )HfQuantizer)get_module_from_nameN)PreTrainedModelc                        e Zd ZdZdZdZdgZ fdZd Zdd
Z	ddddde
ddfdZddde
d	efdZ	 d dddeee
                  fdZd!dZdee
         de
d	ee
         fdZd Zd dZed	efd            Zd Z xZS )"FineGrainedFP8HfQuantizerz
    FP8 quantization implementation supporting both standard and MoE models.
    Supports both e4m3fn formats based on platform.
    TF
acceleratec                 J     t                      j        |fi | || _        d S N)super__init__quantization_config)selfr   kwargs	__class__s      /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/quantizers/quantizer_finegrained_fp8.pyr   z"FineGrainedFP8HfQuantizer.__init__   s1    ,77777#6       c                 \   t                      st          d          t                      st          d          |                    dd          s|                    dd          rt	          d          t
          j                                        st                      st          d          t
          j                                        rKt
          j        
                                }|\  }}|dk     s|dk    r|d	k     rt	          d
| d| d          |                    d          }|t                              d           d S |W| j        sRt          |t                    r?d|                                v sd|                                v rt	          d          d S d S d S d S )NzxUsing fp8 quantization requires torch >= 2.1.0Please install the latest version of torch ( pip install --upgrade torch )zMLoading an FP8 quantized model requires accelerate (`pip install accelerate`)from_tfF	from_flaxzConverting into FP8 weights from tf/flax weights is currently not supported, please make sure the weights are in PyTorch format.zANo GPU or XPU found. A GPU or XPU is needed for FP8 quantization.   	   ziFP8 quantized models is only supported on GPUs with compute capability >= 8.9 (e.g 4090/H100), actual = `.`
device_mapzYou have loaded an FP8 model on CPU and have a CUDA or XPU device available, make sure to set your model on a GPU or XPU device in order to run your model. To remove this warning, pass device_map = 'cuda' or 'xpu'. cpudiskzYou are attempting to load an FP8 model with a device_map that contains a cpu/disk device.This is not supported when the model is quantized on the fly. Please use a quantized checkpoint or remove the cpu/disk device from the device_map.)r   ImportErrorr   get
ValueErrortorchcudais_availabler   RuntimeErrorget_device_capabilityloggerwarning_oncepre_quantized
isinstancedictvalues)r   argsr   compute_capabilitymajorminorr"   s          r   validate_environmentz.FineGrainedFP8HfQuantizer.validate_environment   s   !## 	]  
 '(( 	omnnn::i'' 	6::k5+I+I 	F  
 
'')) 	d-C-E-E 	dbccc:""$$ 	!&!A!A!C!C-LE5		uzzeaii 4#(4 4+04 4 4  
 ZZ--
6    
 #&	z400	 j//1111Vz?P?P?R?R5R5R k   $#	 	 	 	 6S5Rr   dtypetorch.dtypereturnc                 V    |&t                               d           t          j        }|S )NzKSetting dtype to torch.float32 as no dtype was specified in from_pretrained)r-   infor(   float32)r   r8   s     r   update_dtypez&FineGrainedFP8HfQuantizer.update_dtypeN   s&    =KKefffMEr   modelr   param_valueztorch.Tensor
param_nametarget_deviceztorch.devicec                 .   ddl m} ddlm} t	          ||          \  }}	t          ||          rM| j        s|	dk    r+|	dk    r$|j        t          j	        k    rt          d          n|	dk    rt          d          |                    |          }t          j        t          j	                  j        }
t          j        t          j	                  j        }| j        j        \  }}|j        d	d          \  }}||z  d
k    s	||z  d
k    rt          d| d| d| d| d	          |j        }|                    d||z  |||z  |                              d
dddd          }t          j        t          j        |          d          }||z  }|j        }|                    d                              d          }t          j        ||z  |
|                              t          j	                  }|                    d
dddd          }|                    |          }|                    |                                                                          } ||||            |||                    dd          d
         dz   |           d S )Nr   	FP8Linear)_load_parameter_into_modelbiasweightz6Expect quantized weights but got an unquantized weightweight_scale_invz;Expect unquantized weights but got a quantized weight_scaler   zMatrix dimensions (z, z$) must be divisible by block sizes ()r
         )rL   rJ   )dim)minmaxr    z.weight_scale_inv)integrations.finegrained_fp8rE   modeling_utilsrF   r   r0   r/   r8   r(   float8_e4m3fnr'   tofinforP   rQ   r   weight_block_sizeshapereshapepermuteamaxabs	unsqueezeclampsqueeze
reciprocalrsplit)r   r?   r@   rA   rB   r   rE   rF   moduletensor_namefp8_minfp8_maxblock_size_mblock_size_nrowscolsparam_value_orig_shapemax_absscalescale_orig_shapequantized_params                        r   create_quantized_paramz0FineGrainedFP8HfQuantizer.create_quantized_paramT   s    	=<<<<<?????? 35*EEfi(( 	d! d[F%:%:(**{/@EDW/W/W$%]^^^"444$%bccc!nn]33 +e1226+e1226%)%=%O"l &rss+
d,!##tl':a'?'?wdwwdwwXdwwhtwww   "-!2!))$lDL4H,
 

'!Q1a
 
  	
 *UY{33BBB'! ;##--b11  +kE&9wGTTTWWX]Xkll)11!Q1a@@)112HII .//7799DDFF 	#"5*oFFF""5**;*;C*C*CA*FI\*\^cdddddr   c                 z    ddl m} t          ||          \  }}t          ||          r| j        s|dk    rdS dS dS )Nr   rD   rG   FT)rR   rE   r   r0   r/   )r   r?   rA   r   rE   rb   rc   s          r   param_needs_quantizationz2FineGrainedFP8HfQuantizer.param_needs_quantization   s^    <<<<<<25*EEfi(( 	! [F%:%:utur   Nkeep_in_fp32_modulesc                     ddl m} |                     || j        j        |          | _         ||| j        | j                  }| j        |j        _        d S )Nr   )replace_with_fp8_linear)modules_to_not_convertr   )rR   rt   get_modules_to_not_convertr   ru   config)r   r?   rr   r   rt   s        r   $_process_model_before_weight_loadingz>FineGrainedFP8HfQuantizer._process_model_before_weight_loading   sz     	KJJJJJ&*&E&E4+BDX'
 '
# ('#'#> $ 8
 
 
 ,0+C(((r   c                     |S r    )r   r?   r   s      r   #_process_model_after_weight_loadingz=FineGrainedFP8HfQuantizer._process_model_after_weight_loading   s    r   missing_keysprefixc                 &   ddl m} g |                                D ]f\  }}t          ||          rQ|D ]N}||v s	|| d| v r?|                    d          s*|                    d          s                    |           Ogfd|D             S )Nr   rD   r    z.weightz.biasc                     g | ]}|v|	S rz   rz   ).0knot_missing_keyss     r   
<listcomp>zAFineGrainedFP8HfQuantizer.update_missing_keys.<locals>.<listcomp>   s$    EEEa14D+D+D+D+D+Dr   )integrationsrE   named_modulesr0   endswithappend)	r   r?   r|   r}   rE   namerb   missingr   s	           @r   update_missing_keysz-FineGrainedFP8HfQuantizer.update_missing_keys   s    ,,,,,,!//11 	9 	9LD&&),, 9+ 9 9GDv4I4I4I4I,I,I ' 0 0 ; ; -J ' 0 0 9 9 -J )//888EEEE<EEEEr   c                     d|j         j        v r9i ddddddddddddd	d
dd
dddddddddddd
dd
dd}||_        |S )NQwen3z layers.*.self_attn.q_proj.weightlocal_colwisez*layers.*.self_attn.q_proj.weight_scale_invz layers.*.self_attn.k_proj.weightz*layers.*.self_attn.k_proj.weight_scale_invz layers.*.self_attn.v_proj.weightz*layers.*.self_attn.v_proj.weight_scale_invz layers.*.self_attn.o_proj.weightlocal_rowwisez*layers.*.self_attn.o_proj.weight_scale_invzlayers.*.self_attngatherzlayers.*.mlp.gate_proj.weightz'layers.*.mlp.gate_proj.weight_scale_invzlayers.*.mlp.up_proj.weightz%layers.*.mlp.up_proj.weight_scale_invzlayers.*.mlp.down_proj.weightz'layers.*.mlp.down_proj.weight_scale_invzlayers.*.mlp)r   __name__base_model_tp_plan)r   rw   	text_plans      r   update_tp_planz(FineGrainedFP8HfQuantizer.update_tp_plan   s    f&///2O<o 3O =o	
 3O =o 3O =o %h 0 :? . 8 0 :?  !I& )2F%r   c                     dS )NTrz   )r   safe_serializations     r   is_serializablez)FineGrainedFP8HfQuantizer.is_serializable   s    tr   c                     dS )NFrz   r   s    r   is_trainablez&FineGrainedFP8HfQuantizer.is_trainable   s    ur   c                     dS )Nr   rz   r   s    r   get_accelerator_warm_up_factorz8FineGrainedFP8HfQuantizer.get_accelerator_warm_up_factor   s    qr   )r8   r9   r:   r9   r   )r?   r   )r   
__module____qualname____doc__ requires_parameters_quantizationrequires_calibrationrequired_packagesr   r7   r>   strro   boolrq   r   listrx   r{   r   r   r   propertyr   r   __classcell__)r   s   @r   r   r      s        
 (,$ %7 7 7 7 7- - -^   ;e ;e $;e 	;e
 &;e ;e ;e ;ez	.? 	S 	_c 	 	 	 	 59D D D 'tCy1D D D D(   FtCy F# FRVWZR[ F F F F  2    d    X      r   r   )typingr   r   utilsr   r   r   r	   baser   quantizers_utilsr   r(   rS   r   
get_loggerr   r-   r   rz   r   r   <module>r      s    * * * * * * * * ` ` ` ` ` ` ` ` ` ` ` `       2 2 2 2 2 2  LLL 1000000		H	%	%S S S S S S S S S Sr   