
     `i;                        d dl mZ ddlmZmZmZmZ  e            rd dlZd dlm	Z	 d dl
Z
d dlmZ d dlmZ  e            rd dlmZ  ej        e          Ze
j        dej        fd            Zd#d
ej        dedeej        ej        f         fdZe
j        dej        dej        dej        dej        fd            Zej        fdej        dej        dej        dej        dee         dej        dej        fdZ ej!        dej        fdej        dej        dej        dej        deeeef                  dej        dej        fd            Z" G d de	j#                  Z$	 	 	 	 	 d$d!Z%	 	 d%d"Z&dS )&    )Optional   )is_accelerate_availableis_torch_accelerator_availableis_torch_availableloggingN)
functional)init_empty_weights
BLOCK_SIZEc                    t          j        d          }||z  t          j        d|          z   }t          j        | |z                                 t           j                  }t          j        t          j        |                    dz  }||z  }|                    |j        j	                  }t          j
        ||z   |           t          j
        ||z   |           d S )Nr   axisg      |@)tl
program_idarangeloadtofloat32maxabsdtype
element_tystore)	x_ptry_ptrs_ptrr   pidoffsxsys	            }/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/integrations/finegrained_fp8.pyact_quant_kernelr#   $   s    
-Q


Cbi:666D
  ,,A
rvayyE!A	AA	U[#$$AHUT\1HUS[!       r   
block_sizereturnc                 x                                      sJ  j        d         |z  dk    sJ t          j         t          j                  }  j        g                                  d d                              d          |z  R dt          j        i} fd}t          |          |||           ||fS )Nr   r   r   c                 `    t          j                                        | d                   fS )Nr   )tritoncdivnumel)metar   s    r"   gridzact_quant.<locals>.grid6   s%    AGGIItL'9::<<r$   )r   )	is_contiguousshapetorch
empty_likefloat8_e4m3fn	new_emptysizer   r#   )r   r&   r!   r    r0   s   `    r"   	act_quantr8   0   s    ??72;#q((((%"5666ARQVVXXcrc]RAFF2JJ*$<RRREMRRA= = = = = T1az::::a4Kr$   BLOCK_SIZE_MBLOCK_SIZE_NBLOCK_SIZE_KGROUP_SIZE_Mc                    t          j        d          }t          j        ||          }t          j        ||          }||z  }||z  }||z  }t          ||z
  |          }|||z  z   }||z  |z  } ||z  t          j        d|          z   |z  }!| |z  t          j        d|          z   |z  }"t          j        d|          }#| |!dddf         |
z  |#dddf         |z  z   z   }$||#dddf         |z  |"dddf         |z  z   z   }%||!|z  z   }&|"|z  }'||'|z  z   }(t          j        ||ft           j                  })t          dt          j        ||                    D ]}*t          j        |$|#dddf         ||*|z  z
  k     d          }+t          j        |%|#dddf         ||*|z  z
  k     d          },|*|z  }-|-|	z  }.t          j        |&|.|z  z             }/t          j        |(|.|z  z             }0|)t          j	        |+|,          |/dddf         z  |0dddf         z  z  })|$||z  z  }$|%||z  z  }%|j
        j        t           j        k    r |)                    t           j                  }1nY|j
        j        t           j        k    r |)                    t           j                  }1n|)                    t           j                  }1||z  t          j        d|          z   }2| |z  t          j        d|          z   }3|||2dddf         z  z   ||3dddf         z  z   }4|2dddf         |k     |3dddf         |k     z  }5t          j        |4|1|5           dS )zTriton-accelerated function used to perform linear operations (dot
    product) on input tensors `A` and `B` with block-wise quantization, and
    store the result in output tensor `C`.
    r   r   Nr*   g        )maskother)r>   )r   r   r-   minr   zerosr   ranger   dotr   r   bfloat16r   float16r   )6ABCAsBsMNKgroup_ngroup_k	stride_am	stride_ak	stride_bk	stride_bn	stride_cm	stride_cnstride_As_mstride_As_kstride_Bs_kstride_Bs_nr9   r:   r;   r<   r   	num_pid_m	num_pid_nnum_pid_in_groupgroup_idfirst_pid_mgroup_size_mpid_mpid_noffs_amoffs_bnoffs_ka_ptrsb_ptrsAs_ptrsoffs_bsnBs_ptrsaccumulatorkabk_startoffs_ksa_sb_scoffs_cmoffs_cnc_ptrsc_masks6                                                         r"   _w8a8_block_fp8_matmulrw   >   s   J -Q


C<((I<((I#i/&&H\)Ky;.==L3-.E##4E|#bi<&@&@@AEG|#bi<&@&@@AEGYq,''F'!!!T'"Y.aaa91LLMF&D/I-aaa0@90LLMF7[((G'!H8k))G(L,7rzJJJK1bga..// + +GFaaa1q<7G3G!GsSSSGF41q<7G3G!GsSSSl"W$gg+ 5566gg+ 5566rva||c!!!T'l2Sqqq\AA,**,**wR[((NN2;''	
	rz	)	)NN2:&&NN2:&&l"RYq,%?%??Gl"RYq,%?%??GWQQQW---	GD!!!G<L0LLFaaag"wtQQQw'7!';<FHVQV$$$$$$r$   rF   rG   rI   rJ   output_dtypec                 R   t          |          dk    sJ |d         |d         }}| j        d         |j        d         k    sJ | j        dd         |j        dd         k    r|                                 sJ t          j        | j        d         |          |j        d         k    sJ |                                 | j        d         z  |j        dk    r|                                r|j        dk    sJ |j        \  }t          j        |          |j        d         k    sJ t          j        ||          |j        d         k    sJ | j        dd         fz   }	|                     |	|          }
d}|k     r$t          j                  }t          |d          }|}||z  dk    sJ |}fd	}t          |         | ||
||||||                     d
          |                     d          |                    d          |                    d          |
                    d
          |
                    d          |                    d
          |                    d          |                    d          |                    d          |||d           |
S )a  This function performs matrix multiplication with block-wise
    quantization.
    It takes two input tensors `A` and `B` with scales `As` and `Bs`.
    The output is returned in the specified `output_dtype`.
    Args:
        A: The input tensor, e.g., activation.
        B: The input tensor, e.g., weight.
        As: The per-token-group quantization scale for `A`.
        Bs: The per-block quantization scale for `B`.
        block_size: The block size for per-block quantization. It should
        be 2-dim, e.g., [128, 128].
        output_dytpe: The dtype of the returned tensor.
    Returns:
        torch.Tensor: The result of matmul.
    r   r      r)   Nr*   r%      c                 t    t          j        | d                   t          j        | d                   z  fS )Nr9   r:   )r,   r-   )METArK   rL   s    r"   r0   z*w8a8_block_fp8_matmul_triton.<locals>.grid   s3    AtN344v{1d>FZ7[7[[]]r$      )r9   r:   r;   r<   )lenr2   r1   r,   r-   r.   ndimr6   next_power_of_2r   rw   stride)rF   rG   rI   rJ   r&   rx   block_nblock_krM   C_shaperH   r9   r;   r:   r0   rK   rL   s                  @@r"   w8a8_block_fp8_matmul_tritonr      s   . z??a!!}jmWG72;!'"+%%%%73B3<28CRC=((Q__->->((>;qwr{G,,<<<<			QWR[ A6Q;;1??,,;A=7DAq;q'""bhqk1111;q'""bhqk1111gcrclaT!G	G<00AL<-a00<,,L\!Q&&&&L^ ^ ^ ^ ^ ^ 4 			

									
		"
		"
		!
		!!!!1   6 Hr$   input_qweight_qinput_scaleweight_scalec                    | j         dk    r| j        nd| j        d         | j        d         f\  }}}|j        d         }	|                     d|          }
|                    |j        d         d          }|	|d         z  }||d         z  }t          j        ||z  |	ft          j        | j                  }t          |          D ]}||d         z  }||d         z   }t          |          D ]}||d         z  }||d         z   }|
dd||f         }|||||f         }|dd||dz   f         }|||f         }t          j        ||	                                t          j
        dt          j        | j                  ||          |z  }|dd||fxx         |z  cc<   |                    |||	          }|                    |          S )a  
    Performs blocked matrix multiplication with FP8 quantized matrices.

    Args:
        input_q: Quantized input tensor with 1x128 block quantization
        weight_q: Quantized weight tensor with 128x128 block quantization
        input_scale: Scaling factors for input blocks
        weight_scale: Scaling factors for weight blocks
        block_size: Tuple of (M, N) for weight block dimensions
        output_dtype: Desired output dtype
       rz   r   r)   r   deviceN)scale_ascale_b	out_dtype)r   r2   viewr3   rA   r   r   rB   
_scaled_mmttensorr   )r   r   r   r   r&   rx   
batch_sizeseq_len
hidden_dimout_featuresinput_reshapedinput_scale_reshapednum_weight_blocks_mnum_weight_blocks_noutputim_startm_endjn_startn_endinput_blockweight_blockcurr_input_scalecurr_weight_scaleblock_results                             r"   w8a8_block_fp8_matmul_compiler      s4   ( 8?|q7H7HgmmqRYR_`aRbdkdqrsdtNu#J>!$L \\"j11N&++K,=a,@"EE&*Q-7$
15[*w.=U][b[ijjjF&'' 5 5jm#*Q-'*++ 	5 	5A*Q-'Gjm+E )GEM)9:K#GEM75=$@AL  4AAAq1q5yLA ,QT 2   NN$$!L%-WWW-*   ##  111gem#$$$4$$$$/	52 [[Wl;;F99\"""r$   c                        e Zd Zej        Z	 	 	 	 	 ddedededee	eef                  f fdZ
d	ej        d
ej        fdZ xZS )	FP8LinearFNdynamicin_featuresr   biasr&   c                    t                                          ||           || _        || _        t          j                            t	          j        ||t          j	        |                    | _
        | j
                                        dk    rh||d         z   dz
  |d         z  }||d         z   dz
  |d         z  }	t          j        t	          j        ||	t          j        |                    | _        n|                     dd            || _        || _        |r2t          j        t	          j        | j                            | _        d S |                     dd            d S )Nr   rz   r   weight_scale_invr   )super__init__r   r   r3   nn	Parameteremptyr   r   weightelement_sizer   r   register_parameterr&   activation_schemer   )selfr   r   r   r   r&   r   r   scale_out_featuresscale_in_features	__class__s             r"   r   zFP8Linear.__init__)  sS    	l333&(h((\;V_Vent)u)u)uvv;##%%**".A">"BzRS}!T!,z!}!<q!@ZPQ] R$&L.0A_efff% %D!! ##$6===$!2 	2U[1B%C%CDDDIII##FD11111r$   inputr'   c           	         | j                                         dk    r t          j        || j         | j                  S t                      r#t          j                                        j	        nd}t          t          |t          j                  }|                    |j                  5  t          || j        d                   \  }}t          || j         || j        | j        |j                  }d d d            n# 1 swxY w Y   |                                 | j        
|| j        z   }|                    |j                  S )Nrz   cuda)rx   r*   )r   r   Flinearr   r   r3   acceleratorcurrent_acceleratortypegetattrr   r   r8   r&   r   r   r   synchronizer   )r   r   device_typetorch_accelerator_moduleqinputscaler   s          r"   forwardzFP8Linear.forwardK  se   ;##%%))8E4;	::: KiJjJjv%+??AAFFpvK'.uk5:'N'N$)00>> 	 	 )%1C D D5K)O!&  	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 %00222y$$)+995;9///s   +AC??DD)FNNNr   )__name__
__module____qualname__r3   r5   r   intboolr   tupler   Tensorr   __classcell__)r   s   @r"   r   r   &  s        E 04# 2  2 2  2 	 2 U38_- 2  2  2  2  2  2D0U\ 0el 0 0 0 0 0 0 0 0r$   r   Fc                   	 |g }|                                  D ]A\  }}|                    |           t          |t          j                  r||pg vrd                    |          	t          	fd|pg D                       sut                      5  t          |j	        |j
        |j        du|j        j        |j        j        |j        |j                  | j        |<   d}ddd           n# 1 swxY w Y   t%          t'          |                                                    dk    rt+          ||||||          \  }}|                    d           C| |fS )	z%Replace Linear layers with FP8Linear.N.c              3       K   | ]}|v V  	d S )N ).0keycurrent_key_name_strs     r"   	<genexpr>z+_replace_with_fp8_linear.<locals>.<genexpr>u  s)      ]]ss22]]]]]]r$   )r   r   r   r   r   r   r&   Tr   )has_been_replacedr)   )named_childrenappend
isinstancer   Linearjoinanyr
   r   r   r   r   r   r   r   r   weight_block_size_modulesr   listchildren_replace_with_fp8_linearpop)
modeltp_planmodules_to_not_convertcurrent_key_namequantization_configr   namemodule_r   s
            @r"   r   r   d  s    ,,.. ! !f%%%fbi(( 	-T:P:VTV-W-W#&88,<#=#= ]]]]?U?[Y[]]]]] -')) 
- 
-+4$*$6%+%8#[4%}3$m1*=*O#6#H, , ,EN4( )-%
- 
- 
- 
- 
- 
- 
- 
- 
- 
- 
- 
- 
- 
- 
- tFOO%%&&''!++#;& #"3$ $ $ A  	R    ###s   AC11C5	8C5	c                     |dgn|}|j         |                    |j                    t          t          |                    }t	          | | j        ||          \  } }|st                              d           | S )z:Helper function to replace model layers with FP8 versions.Nlm_head)r   r   r   zYou are loading your model using fp8 but no linear modules were found in your model. Please double check your model architecture.)r   extendr   setr   _tp_planloggerwarning)r   r   r   r   s       r"   replace_with_fp8_linearr     s     -C,Ji[[Pf1=%%&9&PQQQ!#&<"="=>>75/	     E  
<	
 	
 	

 Lr$   )r%   )NNNNF)NN)'typingr   utilsr   r   r   r   r3   torch.nnr   r,   triton.languagelanguager   r	   r   
accelerater
   
get_loggerr   r   jit	constexprr#   r   r   r   r8   rw   r   r   r   r   compiler   r   r   r   r   r   r$   r"   <module>r     s          h h h h h h h h h h h h  )LLLMMM      (((((( .------ 
	H	%	% bl    
 
 
3 
u|U\?Y9Z 
 
 
 
 Q%4 ,5Q%6 ,7Q%8 ,9Q%: ,;Q% Q% Q% Q%t !&M M|M|M 	M 		M
 S	M +M \M M M Mb  -1 %># >#\>#l># ># ,	>#
 sCx)># +># \># ># ># >#B;0 ;0 ;0 ;0 ;0	 ;0 ;0 ;0@ +$ +$ +$ +$`       r$   