
    .`iZ                        d Z ddlZddlmZ ddlmZmZ ddlmZm	Z	m
Z
 ddlZddlmZ ddlZddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZ erddlmZ ddlmZ ddl m!Z! ndZdZ ee"          Z# ed          Z$ e
d          Z% e            Z&ej'        j(        )                    d           ede*dede+e         fd            Z,dee$e%f         dee$e%f         fdZ- G d de          Z. G d de.          Z/ G d de.          Z0dZ1	 	 e&2                                 dZ1n# e3$ r dZ1Y nw xY we1re&4                                 n# e1re&4                                 w w xY we1re/ne0Z5e56                                 dS ) z~Code inside this file can safely assume cuda platform, e.g. importing
pynvml. However, it should not initialize cuda context.
    N)Callable)cachewraps)TYPE_CHECKINGOptionalTypeVar)	ParamSpec)init_logger)import_pynvmlcuda_device_count_statelessAttentionBackendEnum   )DeviceCapabilityPlatformPlatformEnum)
VllmConfig)
CacheDType)AttentionSelectorConfig_P_RFuse_mladevice_capabilityreturnc                    | r|j         dk    rDt          j        t          j        t          j        t          j        t          j        t          j        gS t          j        t          j        t          j        t          j        t          j        gS |j         dk    r.t          j        t          j	        t          j
        t          j        gS t          j	        t          j        t          j
        t          j        gS )zEGet backend priorities with lazy import to avoid circular dependency.
   )majorr   FLASHINFER_MLACUTLASS_MLAFLASH_ATTN_MLAFLASHMLA
TRITON_MLAFLASHMLA_SPARSE
FLASHINFER
FLASH_ATTNTRITON_ATTNFLEX_ATTENTION)r   r   s     g/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/platforms/cuda.py_get_backend_prioritiesr*   ,   s       "b(($3$0$3$-$/$4  %3$-$3$/$4  "b(($/$/$0$3	  %/$/$0$3	     fnc                 |     t                     dt          j        dt          j        dt          f fd            }|S )Nargskwargsr   c                      t                                            	  | i |t                                            S # t                                            w xY wN)pynvmlnvmlInitnvmlShutdown)r.   r/   r,   s     r)   wrapperz"with_nvml_context.<locals>.wrapperV   sW    	"2t&v&&!!!!F!!!!s	   = A)r   r   r.   r/   r   )r,   r5   s   ` r)   with_nvml_contextr6   U   sQ    
2YY"rw "") " " " " " " Y" Nr+   c                      e Zd ZU ej        ZdZeed<   dZ	eed<   dZ
eed<   dZeed<   dZeed	<   d
Zeed<   edeej                 fd            Zedej        ddfd            Zed9dededz  fd            Zed9dedefd            Zed9dedefd            Zedee         defd            Zed             Zed:d            Ze	 d;dej        j         dz  de!fd            Z"ededdde#ee#d ef                  e$d ee         f         f         fd!            Z%ed"d dddefd#            Z&eded          fd$            Z'e	 d;d%ed&ej        d'e(d          dd fd(            Z)edefd)            Z*edefd*            Z+edefd+            Z,edefd,            Z-edefd-            Z.edefd.            Z/edefd/            Z0ed&ej        fd0            Z1ed1ej2        d2ej2        d3ej2        d4ej2        ddf
d5            Z3ed1ej2        d2ej2        d3ej2        d4ej2        ddf
d6            Z4edefd7            Z5edefd8            Z6dS )<CudaPlatformBasecudadevice_namedevice_typeCUDAdispatch_keyGPUray_device_keynccldist_backendCUDA_VISIBLE_DEVICESdevice_control_env_varr   c                     |                      d          r#t          j        t          j        t          j        gS |                      d          rt          j        t          j        gS t          j        gS )NP   <   )has_device_capabilitytorchbfloat16float16float32)selfs    r)   supported_dtypesz!CudaPlatformBase.supported_dtypesj   s]    %%b)) 	BNEM5=AA%%b)) 	2M5=11 r+   deviceNc                 p    t           j                            |           t          j        d|          }dS )z:
        Set the device for the current platform.
        r   )rN   N)rH   r9   
set_devicezeros)clsrN   _s      r)   rP   zCudaPlatformBase.set_devicev   s4    
 	
f%%% K&)))r+   r   	device_idc                     t           r1   NotImplementedErrorrR   rT   s     r)   get_device_capabilityz&CudaPlatformBase.get_device_capability       !!r+   c                     t           r1   rV   rX   s     r)   get_device_namez CudaPlatformBase.get_device_name   rZ   r+   c                     t           r1   rV   rX   s     r)   get_device_total_memoryz(CudaPlatformBase.get_device_total_memory   rZ   r+   
device_idsc                     t           r1   rV   )rR   r_   s     r)   is_fully_connectedz#CudaPlatformBase.is_fully_connected   rZ   r+   c                     d S r1    rR   s    r)   log_warningszCudaPlatformBase.log_warnings   s    r+   vllm_configr   c                    ddl m} |j        }|j        }|j        dk    rd|_        |j        }|r|j        d|_        ||j        r|j        t          |j        j	        d          }d}d}d}	ddl
m}
 |j        j        w|j        }t          |d	d
          }|                     d          r|s|dk    rd}	|j        |j        _        n[|                     d          r|sd}nA |
            d         rd}n.n-|j        j        }||j        k    }||j        k    }||j        k    }	|r? |
            d         r/|j        dz  dk    r!d|_        t(                              d           |r/|j        dz  dk    r!d|_        t(                              d           |	r:|j        dk    r/|j        dz  dk    r!d|_        t(                              d           |r,|j        dk    r!d|_        t(                              d           |j        }|8|j        r3|j        r.|j        s)t(                              d           d|_        d S d S d S d S d S )Nr   r   autoz vllm.v1.worker.gpu_worker.Worker   
index_topkF)is_flashmla_dense_supportedqk_nope_head_dimr   d      T@   z7Forcing kv cache block size to 64 for FlashMLA backend.z;Forcing kv cache block size to 128 for CUTLASS_MLA backend.    z<Forcing kv cache block size to 64 for FlashInferMLA backend.z=Forcing kv cache block size to 64 for FlashMLASparse backend.zVForcing --disable_chunked_mm_input for models with multimodal-bidirectional attention.)#vllm.v1.attention.backends.registryr   parallel_configmodel_config
worker_clscache_config
block_sizer   hasattr	hf_configvllm.v1.attention.ops.flashmlark   attention_configbackendhf_text_configgetattris_device_capability_familyr   r"   r    loggerinfoscheduler_configis_mm_prefix_lmis_multimodal_modeldisable_chunked_mm_inputwarning)rR   rf   r   rr   rs   ru   
use_sparseuse_flashmlause_cutlass_mlause_flashinfer_mlark   r|   rl   r{   r   s                  r)   check_and_update_configz(CudaPlatformBase.check_and_update_config   s.   LLLLLL%5"/%//)KO&"/ 	)L3;&(L# $$ %'3 !9!C\RRJ
 !L#O!&RRRRRR+3;!-!<#*>;Mq#Q#Q 33C88& )C// *.& -;  088 44S99 * &*OO002215 #'LL  &6>&*>*GG")-A-M"M%,0D0S%S" W//11!4W !+b0A55*,'UVVV <#:S#@A#E#E*-'Q  
 # +r11 +b0A55*,'R  
  l5;;*,'S   '7 $, % 4 % %= %
 NN;   9=555 %$$$$$$$r+   c                     t           j                                         t           j                            |           t           j                            |          S r1   )rH   r9   empty_cachereset_peak_memory_statsmax_memory_allocated)rR   rN   s     r)   get_current_memory_usagez)CudaPlatformBase.get_current_memory_usage  sD     	
   
**6222z..v666r+   r   attn_selector_configr   r   c                 8   g }i }t          |j        |          }t          |          D ]n\  }}	 |                                } |j        dd|i|                                }	n# t          $ r dg}	Y nw xY w|	r|	||<   W|                    ||f           o||fS )Nr   ImportErrorrc   )r*   r   	enumerate	get_classvalidate_configuration_asdictr   append)
rR   r   r   valid_backends_prioritiesinvalid_reasonsbackend_prioritiespriorityr{   backend_classinvalid_reasons_is
             r)   get_valid_backendsz#CudaPlatformBase.get_valid_backends  s     %'!4 (*;
 
 "++=!>!> 	F 	FHg4 ' 1 1 3 3$HM$H % %&7%*2244% %!!  4 4 4%2O!!!4  F+<(()00'81DEEEE(/99s   5A%%A54A5selected_backendc           	      &  
 |                                  }|J |                    d           }|	 |                                } |j        dd|i|                                }n# t
          $ r dg}Y nw xY w|rt          d| d|           t                              d|           |	                                S | 
                    ||          \  
}dd	                    d
 |                                D                       z   dz   }|                                }t                              d| j         d| d| d           t!          
          dk    rt          d| j         d| d| d          t#          t%          t!          
                    
fd          }|d         }	
|	         d         }t                              d|j        t+          d 
D                       d           |	                                S )N)rv   r   r   zSelected backend z. is not valid for this configuration. Reason: zUsing %s backend.)r   r   {, c              3   \   K   | ]'\  }}|j          d d                    |           dV  (dS )z: [r   ]N)namejoin).0r{   reasonss      r)   	<genexpr>z8CudaPlatformBase.get_attn_backend_cls.<locals>.<genexpr>S  sY        $GW <99DIIg$6$6999     r+   }z*Some attention backends are not valid for z with z. Reasons: .r   z%No valid attention backend found for c                      |          d         S )Nr   rc   )ir   s    r)   <lambda>z7CudaPlatformBase.get_attn_backend_cls.<locals>.<lambda>h  s    3A6q9 r+   )keyz8Using %s attention backend out of potential backends: %sc              3   0   K   | ]}|d          j         V  dS )r   N)r   )r   bs     r)   r   z8CudaPlatformBase.get_attn_backend_cls.<locals>.<genexpr>o  s(      ??!A$)??????r+   local)scoperc   )rY   _replacer   r   r   r   
ValueErrorr   r   get_pathr   r   items__repr__
debug_oncer:   lensortedrange	info_oncer   tuple)rR   r   r   r   r   r   reasons_str
config_strsorted_indicesselected_indexr   s             @r)   get_attn_backend_clsz%CudaPlatformBase.get_attn_backend_cls.  s     5577 ,,,3<<<MM'2 0 : : < <"F-"F # #&7#*2244# #  2 2 2#0/2 3 E(8 E E3BE E  
 /1ABBB'00222 695K5K/!5 6L 6
 6
2!?
 ii  (7(=(=(?(?    
  	 *2244
5 5 55 5&15 5 5	
 	
 	
 ())Q..> > >"> >/:> > >    #/00119999
 
 
 (*4^DQGF!??%>?????	 	 	
 	
 	
  ((***s   5A) )A98A9c                 2    t           j        t           j        gS r1   )r   
TORCH_SDPAr&   rd   s    r)   get_supported_vit_attn_backendsz0CudaPlatformBase.get_supported_vit_attn_backendsu  s     !+ +
 	
r+   	head_sizedtyper{   c                    |X||                                  v s"J d| d|                                               t                              d| d           |S |                                 x}rq|j        dk    rf	 t
          j                                        }|                    |          r!|	                    |          rt
          j        S n# t          $ r Y nw xY wt
          j        S )NzBackend z= is not supported for vit attention. Supported backends are: zUsing backend z for vit attention   )r   r   r   rY   r   r   r&   r   supports_head_sizesupports_dtyper   r   )rR   r   r   r{   ccr   s         r)   get_vit_attn_backendz%CudaPlatformBase.get_vit_attn_backend|  s,    cAACCCCCS7 S S+.+N+N+P+PS S DCC IgIIIJJJN ++---B 	28q== 4 ? I I K K 33  ;#22599; 0::    $..s   =AC 
CCc                     dS )Nz4vllm.lora.punica_wrapper.punica_gpu.PunicaWrapperGPUrc   rd   s    r)   get_punica_wrapperz#CudaPlatformBase.get_punica_wrapper  s    EEr+   c                     	 dS )NzHvllm.distributed.device_communicators.cuda_communicator.CudaCommunicatorrc   rd   s    r)   get_device_communicator_clsz,CudaPlatformBase.get_device_communicator_cls  s     W	
 	
r+   c                 ,    |                      d          S )NY   )rG   rd   s    r)   supports_fp8zCudaPlatformBase.supports_fp8  s    ((,,,r+   c                     dS NTrc   rd   s    r)   use_custom_allreducez%CudaPlatformBase.use_custom_allreduce      tr+   c                     dS r   rc   rd   s    r)   opaque_attention_opz$CudaPlatformBase.opaque_attention_op  r   r+   c                     dS )Nz,vllm.compilation.cuda_graph.CUDAGraphWrapperrc   rd   s    r)   get_static_graph_wrapper_clsz-CudaPlatformBase.get_static_graph_wrapper_cls  s    ==r+   c                     t                      S r1   r   rd   s    r)   device_countzCudaPlatformBase.device_count  s    *,,,r+   c                    |t           j        k    rq|                     d          s^|                                 }|                                 }|d}n|                                }d| }t          d| d| d          d S d S )NrE   z"does not have a compute capabilityzhas compute capability zQBfloat16 is only supported on GPUs with compute capability of at least 8.0. Your z GPU zg. You can use float16 instead by explicitly setting the `dtype` flag in CLI, for example: --dtype=half.)rH   rI   rG   rY   r\   as_version_strr   )rR   r   
capabilitygpu_namecompute_strversion_strs         r)   check_if_supports_dtypez(CudaPlatformBase.check_if_supports_dtype  s    EN"",,R00  6688
..00%"FKK",";";"="=K"IK"I"IK F$F F+6F F F   #" r+   	src_cache	dst_cachesrc_block_indicesdst_block_indicesc                 `    |dd|f         }|                     |j                  |dd|f<   dS )z/Copy blocks from src_cache to dst_cache on GPU.N)torN   rR   r   r   r   r   
_src_caches         r)   insert_blocks_to_devicez(CudaPlatformBase.insert_blocks_to_device  sA     qqq"334
*4--	8H*I*I	!!!&&'''r+   c                 T    |dd|f         }|                                 |dd|f<   dS )z#Copy blocks from GPU to host (CPU).N)cpur   s         r)   swap_out_blocks_to_hostz(CudaPlatformBase.swap_out_blocks_to_host  s<     qqq"334
*4..*:*:	!!!&&'''r+   c                     dS r   rc   rd   s    r)   support_hybrid_kv_cachez(CudaPlatformBase.support_hybrid_kv_cache  r   r+   c                     dS r   rc   rd   s    r)   support_static_graph_modez*CudaPlatformBase.support_static_graph_mode  r   r+   r   )rf   r   r   Nr1   )7__name__
__module____qualname__r   r<   _enumr:   str__annotations__r;   r=   r?   rA   rC   propertylistrH   r   rM   classmethodrN   rP   intr   rY   r\   r^   boolra   re   r   typesDevicefloatr   r   dictr   r   r   r   r   r   r   r   r   r   r   r   r   Tensorr   r   r   r   rc   r+   r)   r8   r8   a   s        EKKL#NCL#"8C888	$u{"3 	 	 	 X	 * * * * * [* " "c ":JT:Q " " " [" " " "C " " " [" " " "C " " " [" "DI "$ " " " ["   [ o= o= o= [o=b 157 7['$.7	7 7 7 [7 :+: 8: 
U)3./0#T#Y./	1
	: : : [:< D+0D+ 8D+ 
	D+ D+ D+ [D+L 
5K0L 
 
 
 [
 
 59	/ // {/ 01	/
 
 / / / [/6 F3 F F F [F 
C 
 
 
 [

 -T - - - [- T    [ D    [ >S > > > [> -S - - - [- EK    [( 	J<	J <	J !<		J
 !<	J 
	J 	J 	J [	J 	;<	; <	; !<		;
 !<	; 
	; 	; 	; [	;     [ $    [  r+   r8   c            	           e Zd Zeeeddededz  fd                                    Zee	 dde	eef         ez  dede
f fd                        Zeeddedefd                        Zeeddedefd	                        Zeeddedefd
                        Zeedee         de
fd                        Zeddedefd            Zeed                         Z xZS )NvmlCudaPlatformr   rT   r   Nc                     	 |                      |          }t                              |          }t                              |          \  }}t	          ||          S # t
          $ r Y d S w xY wN)r   minor)device_id_to_physical_device_idr2   nvmlDeviceGetHandleByIndex"nvmlDeviceGetCudaComputeCapabilityr   RuntimeError)rR   rT   physical_device_idhandler   r  s         r)   rY   z&NvmlCudaPlatform.get_device_capability  s{    	!$!D!DY!O!O667IJJF!DDVLLLE5#%u==== 	 	 	44	s   AA 
A-,A-r   c                 l    	 t                                          ||          S # t          $ r Y dS w xY w)NF)superrG   r  )rR   r   rT   	__class__s      r)   rG   z&NvmlCudaPlatform.has_device_capability  sD    	7700YGGG 	 	 	55	s   !% 
33c                 V    |                      |          }|                     |          S r1   )r  _get_physical_device_name)rR   rT   r  s      r)   r\   z NvmlCudaPlatform.get_device_name	  s-     !@@KK,,-?@@@r+   c                     |                      |          }t                              |          }t                              |          S r1   )r  r2   r  nvmlDeviceGetUUIDrR   rT   r  r  s       r)   get_device_uuidz NvmlCudaPlatform.get_device_uuid  s@     !@@KK223EFF''///r+   c                     |                      |          }t                              |          }t          t                              |          j                  S r1   )r  r2   r  r  nvmlDeviceGetMemoryInfototalr  s       r)   r^   z(NvmlCudaPlatform.get_device_total_memory  sK     !@@KK223EFF611&99?@@@r+   physical_device_idsc                 Z   d |D             }t          |          D ]\  }}t          |          D ]y\  }}||k     rn	 t                              ||t          j                  }|t          j        k    r  dS G# t          j        $ r  t                              d           Y   dS w xY wzdS )zP
        query if the set of gpus are fully connected by nvlink (1 hop)
        c                 B    g | ]}t                               |          S rc   )r2   r  )r   r   s     r)   
<listcomp>z7NvmlCudaPlatform.is_fully_connected.<locals>.<listcomp>#  s&    UUUA644Q77UUUr+   FzONVLink detection failed. This is normal if your machine has no NVLink equipped.T)r   r2   nvmlDeviceGetP2PStatusNVML_P2P_CAPS_INDEX_NVLINKNVML_P2P_STATUS_OK	NVMLErrorr   	exception)rR   r  handlesr   r  jpeer_handle
p2p_statuss           r)   ra   z#NvmlCudaPlatform.is_fully_connected  s     VUATUUU"7++ 	% 	%IAv"+G"4"4 % %;q55%%+%B%B"'"=& &

 &)BBB#(555 C!+ % % %((D    %uuuu% %  ts   6A77)B&%B&c                 j    t                               |          }t                               |          S r1   )r2   r  nvmlDeviceGetName)rR   rT   r  s      r)   r  z*NvmlCudaPlatform._get_physical_device_name7  s)    229==''///r+   c                 j    t                                           }|dk    r fdt          |          D             }t          t	          |                    dk    rUt
          j                            d          dk    r4t          	                    dd
                    |                     d S d S d S d S )Nr   c                 :    g | ]}                     |          S rc   )r  )r   r   rR   s     r)   r"  z1NvmlCudaPlatform.log_warnings.<locals>.<listcomp>A  s'    XXXC99!<<XXXr+   CUDA_DEVICE_ORDER
PCI_BUS_IDzDetected different devices in the system: %s. Please make sure to set `CUDA_DEVICE_ORDER=PCI_BUS_ID` to avoid unexpected behavior.r   )r2   nvmlDeviceGetCountr   r   setosenvirongetr   r   r   )rR   r_   device_namess   `  r)   re   zNvmlCudaPlatform.log_warnings<  s     !3355
>>XXXXeJFWFWXXXLC%%&&**JNN#677<GG1 IIl++	     > +*GGr+   r   )r   r   r   r   r   r6   r  r   rY   r   r  rG   r   r\   r  r^   r   ra   r  re   __classcell__)r  s   @r)   r	  r	    s1       
 c :JT:Q     U [   #s(Oc)  
	      [ A A AC A A A  [A 0 0 0C 0 0 0  [0
 A A AC A A A  [A
 T#Y 4     [0 0 0# 0c 0 0 0 [0    [    r+   r	  c                       e Zd Zeed
dedefd                        Zed
dedefd            Z	ed
dedefd            Z
edee         defd            Zd	S )NonNvmlCudaPlatformr   rT   r   c                 h    t           j                            |          \  }}t          ||          S r  )rH   r9   rY   r   )rR   rT   r   r  s       r)   rY   z)NonNvmlCudaPlatform.get_device_capabilityO  s0     z77	BBue59999r+   c                 @    t           j                            |          S r1   )rH   r9   r\   rX   s     r)   r\   z#NonNvmlCudaPlatform.get_device_nameU  s    z)))444r+   c                 N    t           j                            |          }|j        S r1   )rH   r9   get_device_propertiestotal_memory)rR   rT   device_propss      r)   r^   z+NonNvmlCudaPlatform.get_device_total_memoryY  s     z77	BB((r+   r  c                 :    t                               d           dS )Nz^NVLink detection not possible, as context support was not found. Assuming no NVLink available.F)r   r'  )rR   r  s     r)   ra   z&NonNvmlCudaPlatform.is_fully_connected^  s&    8	
 	
 	
 ur+   Nr   )r   r   r   r   r   r  r   rY   r   r\   r^   r   r  ra   rc   r+   r)   r:  r:  N  s        
: :c ::J : : : U [: 5 5 5C 5 5 5 [5 ) ) )C ) ) ) [) T#Y 4    [  r+   r:  T)7__doc__r4  collections.abcr   	functoolsr   r   typingr   r   r   rH   typing_extensionsr	   vllm._Cvllmvllm.loggerr
   vllm.utils.import_utilsr   vllm.utils.torch_utilsr   rq   r   	interfacer   r   r   vllm.configr   vllm.config.cacher   vllm.v1.attention.selectorr   r   r   r   r   r2   backendsr9   enable_cudnn_sdpr  r   r*   r6   r8   r	  r:  nvml_availabler3   	Exceptionr4   CudaPlatformre   rc   r+   r)   <module>rU     sK    
			 $ $ $ $ $ $ " " " " " " " " 3 3 3 3 3 3 3 3 3 3  ' ' ' ' ' '  # # # # # # 1 1 1 1 1 1 > > > > > > D D D D D D ? ? ? ? ? ? ? ? ? ? &&&&&&,,,,,,BBBBBBBJJ	X		Yt__WT]]	   $ $U + + + %%'% 

% % % %P	(2r6* 	xB/? 	 	 	 	H H H H Hx H H H^[ [ [ [ [' [ [ [|    *   6 	        $2J7J        s*   D' &E 'D1.E 0D11E E$