import contextlib
import os
from typing import TYPE_CHECKING, Optional

import torch

from vllm.logger import init_logger
from vllm.v1.attention.backends.registry import AttentionBackendEnum

from .interface import DeviceCapability, Platform, PlatformEnum

if TYPE_CHECKING:
    from vllm.config import VllmConfig
    from vllm.v1.attention.selector import AttentionSelectorConfig
else:
    VllmConfig = None

logger = init_logger(__name__)


class XPUPlatform(Platform):
    _enum = PlatformEnum.XPU
    device_name: str = "xpu"
    device_type: str = "xpu"
    dispatch_key: str = "XPU"
    ray_device_key: str = "GPU"
    dist_backend: str = "ccl"
    device_control_env_var: str = "ZE_AFFINITY_MASK"

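    # vllm._moe_C is an optional compiled MoE extension; importing it is
    # best-effort so that XPU builds shipped without it still load.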
    @classmethod
    def import_kernels(cls) -> None:
        with contextlib.suppress(ImportError):
            import vllm._moe_C  # noqa: F401

    @classmethod
    def get_attn_backend_cls(
        cls,
        selected_backend: "AttentionBackendEnum",
        attn_selector_config: "AttentionSelectorConfig",
    ) -> str:
        from vllm.v1.attention.backends.utils import set_kv_cache_layout

        set_kv_cache_layout("NHD")
        logger.info(
            "Setting VLLM_KV_CACHE_LAYOUT to 'NHD' for XPU; "
            "only NHD layout is supported by XPU attention kernels."
        )
        dtype = attn_selector_config.dtype
        if attn_selector_config.use_sparse:
            raise NotImplementedError("Sparse Attention is not supported on XPU.")

        if selected_backend == AttentionBackendEnum.TRITON_ATTN:
            logger.info_once("Using Triton backend.")
            return AttentionBackendEnum.TRITON_ATTN.get_path()
        elif selected_backend == AttentionBackendEnum.FLASH_ATTN:
            if dtype == torch.float32:
                logger.warning_once(
                    "Flash Attention on XPU does not support float32 dtype. "
                    "Falling back to Triton Attention backend."
                )
                return AttentionBackendEnum.TRITON_ATTN.get_path()
            logger.info_once("Using Flash Attention backend.")
            return AttentionBackendEnum.FLASH_ATTN.get_path()
        elif selected_backend:
            raise ValueError(
                f"Invalid attention backend for {cls.device_name}, "
                f"with use_mla: {attn_selector_config.use_mla}"
            )

        logger.info("Using Flash Attention backend.")
        return AttentionBackendEnum.FLASH_ATTN.get_path()

 	4555#.77999    c                     t           j        gS N)r   r5   r    s    r"   get_supported_vit_attn_backendsz+XPUPlatform.get_supported_vit_attn_backendsO   s     !+
 	
r9   	head_sizer,   backendc                     |Y||                                  v s#J d| d|                                   d            t                              d| d           |S t                              dt          j         d           t          j        S )NzBackend z= is not supported for vit attention. Supported backends are: .zUsing backend z for vit attention)r=   r*   r0   r   r5   )r    r>   r,   r?   s       r"   get_vit_attn_backendz XPUPlatform.get_vit_attn_backendV   s     cAACCCCC<7 < <6688< < < DCC
 IgIIIJJJNP1<PPP	
 	
 	
 $..r9   devicec                 D    t           j                            |           dS )z:
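    # Device-management hooks below all route through torch.xpu.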
    @classmethod
    def set_device(cls, device: torch.device) -> None:
        """
        Set the device for the current platform.
        """
        torch.xpu.set_device(device)

 		V$$$$$r9   r   	device_idc                     d S r;    r    rG   s     r"   get_device_capabilityz!XPUPlatform.get_device_capabilityr   s	     tr9   c                 @    t           j                            |          S r;   )r2   r   get_device_namerJ   s     r"   rM   zXPUPlatform.get_device_name{   s    y((333r9   c                 @    t          j        dd          dk    }|sdS dS )NXPU_USE_TRITON_KERNEL01z4vllm.lora.punica_wrapper.punica_xpu.PunicaWrapperXPUz4vllm.lora.punica_wrapper.punica_gpu.PunicaWrapperGPU)osgetenv)r    xpu_use_triton_kernels     r"   get_punica_wrapperzXPUPlatform.get_punica_wrapper   s/     "	*A3 G G3 N$ 	JIIIIr9   c                 N    t           j                            |          }|j        S r;   )r2   r   get_device_propertiestotal_memory)r    rG   device_propss      r"   get_device_total_memoryz#XPUPlatform.get_device_total_memory   s     y66yAA((r9   c                 (    t          j                    S r;   )r2   no_gradr<   s    r"   inference_modezXPUPlatform.inference_mode   s    }r9   vllm_configc                    |j         }|j        }|r|j        d|_        ddlm}m} |j        }|j        g |_        |j        |j	        k    s
J d            |j
        |j	        |_        |j        dt          j        d<   |j        }|j        dk    rd|_        |j        d	|j        _        |r]|j        rXt(                              d
           d|j        _        t1          |j        j        |j        j                  |j        _        d S d S d S )N@   r   )CompilationModeCUDAGraphModez%CUDA graph mode should be NONE on XPU16IGC_ForceOCLSIMDWidthautoz#vllm.v1.worker.xpu_worker.XPUWorkerTz`MLA is enabled on a non-GPU platform; forcing chunked prefill and prefix caching to be disabled.F)cache_configmodel_config
block_sizevllm.configra   rb   compilation_configcompile_sizescudagraph_modeNONElora_configmodespeculative_configrR   environparallel_config
worker_clskv_transfer_configenable_permute_local_kvr7   r*   r+   scheduler_configenable_chunked_prefillmaxmax_model_lenDEFAULT_MAX_NUM_BATCHED_TOKENSmax_num_batched_tokens)r    r^   rf   rg   ra   rb   rj   rr   s           r"   check_and_update_configz#XPUPlatform.check_and_update_config   sb   "/"/ 	)L3;&(L# 	?>>>>>>>(;+3/1,!0M4FFFF3 GFF ".&5&:#)526BJ./%5 %//)NO&)5EIK*B 		L0 		KK=   CHK(?BE(6,KC CK(???		 		 		 		r9   c                     dS NTrI   r<   s    r"   support_hybrid_kv_cachez#XPUPlatform.support_hybrid_kv_cache       tr9   c                     dS )NFrI   r<   s    r"   support_static_graph_modez%XPUPlatform.support_static_graph_mode   s    ur9   c                     dS r~   rI   r<   s    r"   is_pin_memory_availablez#XPUPlatform.is_pin_memory_available   r   r9   c                 ~    t           j                            |           t           j                            |          S r;   )r2   r   reset_peak_memory_statsmax_memory_allocatedrF   s     r"   get_current_memory_usagez$XPUPlatform.get_current_memory_usage   s0     		))&111y--f555r9   c                     t           j        S r;   )r2   float8_e5m2r<   s    r"   	fp8_dtypezXPUPlatform.fp8_dtype   s      r9   c                     |                                                                  }|                    d          dk    S )Nzdata center gpur   )rM   lowercount)r    r   s     r"   is_data_center_gpuzXPUPlatform.is_data_center_gpu   s9    ))++1133  !233a77r9   c                     dS )NzFvllm.distributed.device_communicators.xpu_communicator.XpuCommunicatorrI   r<   s    r"   get_device_communicator_clsz'XPUPlatform.get_device_communicator_cls   s    WWr9   c                 >    t           j                                        S r;   )r2   r   device_countr<   s    r"   r   zXPUPlatform.device_count   s    y%%'''r9   c                     |t           j        k    rN|                                                                 }|                    d          dk    rt          d          d S d S )Na770r   zIntel Arc A770 have bfloat16 accuracy known issue. You can use float16 instead by explicitly setting the `dtype` flag in CLI, for example: --dtype=half.)r2   bfloat16rM   r   r   r6   )r    r,   r   s      r"   check_if_supports_dtypez#XPUPlatform.check_if_supports_dtype   sm    EN""--//5577K  ((1,, F  	 #" -,r9   c                     dS r~   rI   r<   s    r"   opaque_attention_opzXPUPlatform.opaque_attention_op   r   r9   	src_cache	dst_cachesrc_block_indicesdst_block_indicesc                 `    |dd|f         }|                     |j                  |dd|f<   dS )z/Copy blocks from src_cache to dst_cache on XPU.N)torC   r    r   r   r   r   
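    # KV-cache block movement for host offload: gather the selected blocks
    # along dim 1 (the block axis) of the source cache, then scatter them
    # into the destination cache.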
    @classmethod
    def insert_blocks_to_device(
        cls,
        src_cache: torch.Tensor,
        dst_cache: torch.Tensor,
        src_block_indices: torch.Tensor,
        dst_block_indices: torch.Tensor,
    ) -> None:
        """Copy blocks from src_cache to dst_cache on XPU."""
        _src_cache = src_cache[:, src_block_indices]
        dst_cache[:, dst_block_indices] = _src_cache.to(dst_cache.device)

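    # Reverse direction: Tensor.cpu() materializes the blocks in host memory.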
    @classmethod
    def swap_out_blocks_to_host(
        cls,
        src_cache: torch.Tensor,
        dst_cache: torch.Tensor,
        src_block_indices: torch.Tensor,
        dst_block_indices: torch.Tensor,
    ) -> None:
        """Copy blocks from XPU to host (CPU)."""
        _src_cache = src_cache[:, src_block_indices]
        dst_cache[:, dst_block_indices] = _src_cache.cpu()