
     `i'5                         d dl mZmZ ddlmZ erddlmZ ddlmZm	Z	m
Z
mZ ddlmZ  e
            rd dlZ ej        e          Z G d	 d
e          ZdS )    )TYPE_CHECKINGOptional   )HfQuantizer   )PreTrainedModel)is_accelerate_availableis_fbgemm_gpu_availableis_torch_availablelogging)get_module_from_nameNc                        e Zd ZdZdZdZddgZ fdZd ZddZ	ddde
d
efdZddddde
ddfdZd dZ	 d!dddeee
                  fdZdee
         de
d
ee
         fdZd Zd!dZed
efd            Z xZS )"FbgemmFp8HfQuantizerz/
    FP8 quantization using fbgemm kernels
    TFz
fbgemm-gpu
acceleratec                 J     t                      j        |fi | || _        d S N)super__init__quantization_config)selfr   kwargs	__class__s      /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/quantizers/quantizer_fbgemm_fp8.pyr   zFbgemmFp8HfQuantizer.__init__+   s1    ,77777#6       c                    t                      st          d          t                      st          d          t          d          st          d          t          j                                        st          d          t          j                                        }|\  }}|dk     rt          d          |
                    d          }|t                              d	           d S |W| j        sRt          |t                    r?d
|                                v sd|                                v rt          d          d S d S d S d S )NzUsing fbgemm fp8 quantization requires torch >= 2.1.0Please install the latest version of torch ( pip install --upgrade torch )zUsing fbgemm fp8 quantization requires fbgemm-gpu libraryPlease install the latest version of fbgemm-gpu library by following : https://pytorch.org/FBGEMM/fbgemm_gpu-development/InstallationInstructions.html#fbgemm-gpu-install-librariesz0.32.2z`Loading an FP8 quantized model requires accelerate > 0.32.1 (`pip install --upgrade accelerate`)z=Using FP8 quantized models with fbgemm kernels requires a GPU	   zXFP8 quantized models is only supported on GPUs with compute capability >= 9.0 (e.g H100)
device_mapzYou have loaded an FP8 model on CPU and have a CUDA device available, make sure to set your model on a GPU device in order to run your model. To remove this warning, pass device_map = 'cuda'. cpudiskzYou are attempting to load an FP8 model with a device_map that contains a CPU or disk device.This is not supported when the model is quantized on the fly. Please use a quantized checkpoint or remove the CPU or disk device from the device_map.)r   ImportErrorr
   r	   torchcudais_availableRuntimeErrorget_device_capability
ValueErrorgetloggerwarning_oncepre_quantized
isinstancedictvalues)r   argsr   compute_capabilitymajorminorr   s          r   validate_environmentz)FbgemmFp8HfQuantizer.validate_environment/   s   !## 	]   '(( 	F  
 'x00 	r   z&&(( 	`^___"Z==??)u199j   ZZ--
|     #&	z400	 j//1111Vz?P?P?R?R5R5R n   $#	 	 	 	 6S5Rr   dtypetorch.dtypereturnc                     |(t           j        }t                              d|           n|t           j        k    rt          d          |S )NzOverriding dtype=%s with `dtype=torch.bloat16` due to requirements of `fbgemm-gpu` to enable model loading in fp8. Pass your own dtype to specify the dtype of the remaining non-linear layers or pass dtype=torch.bfloat16 to remove this warning.zYYou cannot use FP8 with dtype=torch.float16.We recommend you passing dtype=torch.bfloat16)r!   bfloat16r(   infofloat16r&   )r   r3   s     r   update_dtypez!FbgemmFp8HfQuantizer.update_dtype\   s_    =NEKK@     em##k   r   modelr   
param_namec                     ddl m}m} t          ||          \  }}t	          ||          r| j        s|dk    rdS dS t	          ||          r| j        s|dk    rdS dS dS )Nr   FbgemmFp8LinearFbgemmFp8Llama4TextExpertsbiasFT)integrationsr?   r@   r   r+   r*   )r   r;   r<   r   r?   r@   moduletensor_names           r   param_needs_quantizationz-FbgemmFp8HfQuantizer.param_needs_quantizationl   s    NNNNNNNN25*EEfo.. 	! [F%:%:utf899 	! [F%:%:utur   param_valueztorch.Tensortarget_deviceztorch.devicec                 R   ddl m}m} t          ||          \  }}	t	          ||          rM| j        s|	dk    r+|	dk    r$|j        t          j        k    rt          d          n|	dk    rt          d          t	          ||          r(| j        s!|	dk    s|	dk    s|	d	k    rt          d          t	          ||          r|	d
k    r|
                    dd          }
|
j        }|
                    d|d                   }t          j        j                            |          \  }}|                    |          }|
                    dd          }|                    |d         d|d                   }n|	dk    r|
                    dd          }
|
j        }|
                    d|d                   }t          j        j                            |          \  }}|                    |          }|
                    dd          }|                    |d         |d         d          }t          j                            |                    |                    |j        |	 d<   nt          j        j                            |          \  }}t          j                            |                    |j        d         d                              |                    |j        |	 d<   t          j                            |                    |                    |j        |	<   ~d S )Nr   r>   rA   weightz6Expect quantized weights but got an unquantized weightweight_scalez;Expect unquantized weights but got a quantized weight_scalegate_up_proj_scaledown_proj_scalegate_up_projr   r   	down_proj_scale)rB   r?   r@   r   r+   r*   r3   r!   float8_e4m3fnr&   	transposeshapereshapeopsfbgemmquantize_fp8_per_rownn	Parameterto_parametersview)r   r;   rF   r<   rG   r   r?   r@   rC   rD   transposed_paramoriginal_shapeflattened_paramnew_value_flatweight_scale_flat	new_valuerJ   s                    r   create_quantized_paramz+FbgemmFp8HfQuantizer.create_quantized_param}   sE    	ONNNNNNN25*EE fo.. 	d! d[F%:%:(**{/@EDW/W/W$%]^^^.00$%bcccf899 	d& d+*?*?"666+IZ:Z:Z$%bcccf899 '	n,, $/#8#8A#>#>  "2!7"2":":2~b?Q"R"R 5:I4D4Y4YZi4j4j1 1 +22>BB	%//155	0889JA~^_O`aa++ $/#8#8A#>#>  "2!7"2":":2~b?Q"R"R 5:I4D4Y4YZi4j4j1 1 +22>BB	%//155	0889JN[\L]_`aa9>9K9KLOO\iLjLj9k9kF+55566&+i&6&K&KK&X&X#I|9>9K9K!!,"4Q"7;;>>}MM: :F+5556 +0(*<*<Y\\-=X=X*Y*Y;'JJr   c                     |S r    )r   r;   r   s      r   #_process_model_after_weight_loadingz8FbgemmFp8HfQuantizer._process_model_after_weight_loading   s    r   Nkeep_in_fp32_modulesc                     ddl m} |j        }|                     || j        j        |          | _        |j        } ||| j        | j        | j        ||          }| j        |j        _        d S )Nr   )replace_with_fbgemm_fp8_linear)modules_to_not_convertr   r*   configtp_plan)rB   ri   _tp_planget_modules_to_not_convertr   rj   rk   r*   )r   r;   rg   r   ri   rl   rk   s          r   $_process_model_before_weight_loadingz9FbgemmFp8HfQuantizer._process_model_before_weight_loading   s     	BAAAAA.&*&E&E4+BDX'
 '
# ..#'#> $ 8,
 
 
 ,0+C(((r   missing_keysprefixc                 .  	 ddl m}m} g 	|                                D ]h\  }}t	          |||f          rQ|D ]N}||v s	|| d| v r?|                    d          s*|                    d          s	                    |           Oi	fd|D             S )Nr   r>   .z.weightz.biasc                     g | ]}|v|	S re   re   ).0knot_missing_keyss     r   
<listcomp>z<FbgemmFp8HfQuantizer.update_missing_keys.<locals>.<listcomp>   s$    EEEa14D+D+D+D+D+Dr   )rB   r?   r@   named_modulesr+   endswithappend)
r   r;   rp   rq   r?   r@   namerC   missingrw   s
            @r   update_missing_keysz(FbgemmFp8HfQuantizer.update_missing_keys   s    NNNNNNNN!//11 	9 	9LD&&?4N"OPP 9+ 9 9GDv4I4I4I4I,I,I ' 0 0 ; ; -J ' 0 0 9 9 -J )//888EEEE<EEEEr   c                    d|j         j        v rxi ddddddddddddd	d
dddddddddddddddddd
dddddddd
dddd	}|                                ||                                _        n||_        |S |S )NLlama4z layers.*.self_attn.q_proj.weightlocal_colwisez&layers.*.self_attn.q_proj.weight_scalez layers.*.self_attn.k_proj.weightz&layers.*.self_attn.k_proj.weight_scalez layers.*.self_attn.v_proj.weightz&layers.*.self_attn.v_proj.weight_scalez layers.*.self_attn.o_proj.weightlocal_rowwisezlayers.*.self_attngatherzlayers.*.input_layernorm.weightsequence_parallelz(layers.*.post_attention_layernorm.weightznorm.weightz4layers.*.feed_forward.shared_expert.gate_proj.weightz:layers.*.feed_forward.shared_expert.gate_proj.weight_scalez2layers.*.feed_forward.shared_expert.up_proj.weightz8layers.*.feed_forward.shared_expert.up_proj.weight_scalez4layers.*.feed_forward.shared_expert.down_proj.weightzlayers.*.feed_forward.expertslocallocal_packed_rowwise)	zlayers.*.feed_forwardz0layers.*.feed_forward.experts.*.gate_proj.weightz6layers.*.feed_forward.experts.*.gate_proj.weight_scalez.layers.*.feed_forward.experts.*.up_proj.weightz4layers.*.feed_forward.experts.*.up_proj.weight_scalez0layers.*.feed_forward.experts.*.down_proj.weightz*layers.*.feed_forward.experts.gate_up_projz0layers.*.feed_forward.experts.gate_up_proj_scalez'layers.*.feed_forward.experts.down_proj)r   __name__get_text_configbase_model_tp_plan)r   rk   	text_plans      r   update_tp_planz#FbgemmFp8HfQuantizer.update_tp_plan   s\   v'000$ 3O	$
 9/$ 3O$ 9/$ 3O$ 9/$ 3O$ %h$ 23F$ ;<O$ 2$& G'$( Mo)$* Eo+$, KO-$. G/$0 01$2 *2DSJYBQHWDS ?UDZ;JG$ $ $IJ %%''3>G&&((;;,5)Mr   c                     dS )NTre   )r   safe_serializations     r   is_serializablez$FbgemmFp8HfQuantizer.is_serializable  s    tr   c                     dS )NFre   )r   s    r   is_trainablez!FbgemmFp8HfQuantizer.is_trainable   s    ur   )r3   r4   r5   r4   )r;   r   r   )r   
__module____qualname____doc__ requires_parameters_quantizationrequires_calibrationrequired_packagesr   r2   r:   strboolrE   rc   rf   r   listro   r~   r   r   propertyr   __classcell__)r   s   @r   r   r   !   s         (,$ %|47 7 7 7 7+ + +Z    .? S _c    "D D $D 	D
 &D D D DL    59D D D 'tCy1D D D D2FtCy F# FRVWZR[ F F F F- - -^    d    X    r   r   )typingr   r   baser   modeling_utilsr   utilsr	   r
   r   r   quantizers_utilsr   r!   
get_loggerr   r(   r   re   r   r   <module>r      s    + * * * * * * *        1000000 a a a a a a a a a a a a 2 2 2 2 2 2  LLL 
	H	%	%A A A A A; A A A A Ar   