
    -`i-                     0   d dl Z d dlZd dlZd dlmZ d dlmZ d dlmZ d dl	Z	d dl
mZ d dlmZ d dlmZ  ee          ZdZ	 d d	lmZmZmZ d d
lmZ  ed          Z e            ZdZn# e$ r dZdZdZdZdZdZY nw xY weeeeef         Ze j         G d d                      Z deddfdZ!deddfdZ"deegef         deeegdf         de	j#        j$        j%        fdZ&edeegef         deeegdf         ddfd            Z' G d d          Z(dS )    N)Callable)contextmanager)Any)init_logger)is_pin_memory_available)find_loaded_libraryF)init_modulepython_create_and_mappython_unmap_and_release)CudaRTLibrarycumem_allocatorTc                   B    e Zd ZU eed<   eed<   dZej        dz  ed<   dS )AllocationDatahandletagNcpu_backup_tensor)	__name__
__module____qualname__
HandleType__annotations__strr   torchTensor     o/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/device_allocator/cumem.pyr   r   4   s?         	HHH-1u|d*11111r   r   allocation_handlereturnc                     t          |   d S N)r
   r   s    r   create_and_mapr#   ;   s    ,----r   c                     t          |   d S r!   )r   r"   s    r   unmap_and_releaser%   ?   s    /0000r   python_malloc_fnpython_free_funcc                 |    t          | |           t          j        j                            t
          dd          }|S )N	my_mallocmy_free)r	   r   cudamemoryCUDAPluggableAllocatorlib_name)r&   r'   	new_allocs      r   get_pluggable_allocatorr0   C   s?      "2333
!88+y I r   c              #     K   t          | |          }t          j        j                            |j                  }t          j        j                            |          5  ||fV  d d d            d S # 1 swxY w Y   d S r!   )r0   r   r+   r,   MemPool
_allocatoruse_mem_pool)r&   r'   r/   mem_pools       r   use_memory_pool_with_allocatorr6   M   s       ((8:JKKIz (()=>>H			'	'	1	1 " "	!!!!" " " " " " " " " " " " " " " " " "s    A44A8;A8c                       e Zd ZU dZdZd ed<   dZeed<   edd            Z	d Z
d	eddfd
ZdedefdZddeedf         ez  dz  ddfdZddee         dz  ddfdZeddedz  fd            ZdefdZdS )CuMemAllocatora'  
    A singleton class that manages a memory pool for CUDA tensors.
    The memory in this pool can be offloaded or discarded when the
    allocator sleeps.

    Inside the `use_memory_pool(tag)` context, all tensors created will
    be allocated in the memory pool, and has the same tag as the
    tag passed to the context.

    When we call `sleep`, all tensors with the specified tag will be
    offloaded to CPU memory, and the rest of the tensors will be discarded.
    When we call `wake_up`, all tensors that are previously offloaded
    will be loaded back to GPU memory, and the rest of the tensors will
    have empty memory.

    Why it needs to be a singleton?
    When allocated tensors are garbage collected, PyTorch will call
    the free callback, which will call the `python_free_callback` method.
    The C-extension uses a global variable to store the function of an
    instance of this class. If we create multiple instances of this class,
    the global variable will be overwritten and the free callback will
    not work as expected.
    Ninstancedefaultdefault_tagr   c                      t           s
J d            t          j        t                      t          _        t          j        S )z
        CuMemAllocator is a singleton class.
        We cannot call the constructor directly.
        Call this method to get the instance.
        z cumem allocator is not available)cumem_availabler8   r9   r   r   r   get_instancezCuMemAllocator.get_instances   s8     BB BBB"*&4&6&6N#&&r   c                     t           j                            dd          }d|vs
J d            i | _        t          j        | _        i | _        | j        | _	        | j
        | _        d S )NPYTORCH_CUDA_ALLOC_CONF zexpandable_segments:TruezExpandable segments are not compatible with memory pool. Please track https://github.com/pytorch/pytorch/issues/147851 for the latest updates.)osenvirongetpointer_to_datar8   r;   current_tagallocator_and_pools_python_malloc_callbackpython_malloc_callback_python_free_callbackpython_free_callback)selfconfs     r   __init__zCuMemAllocator.__init__   sq    z~~7<<)555& 655 ;= . :35  '+&B#$($>!!!r   r   c                     |d         }t          || j                  | j        |<   t                              d|d         | j        |           dS )zj
        Internal method to store the allocation data
        when memory is allocated in the memory pool.   z>Allocated %s bytes for %s with address %s from cumem allocator   N)r   rF   rE   loggerdebug)rL   r   py_d_mems      r   rH   z&CuMemAllocator._python_malloc_callback   s`     %Q')7t/*
 *
X& 	La 		
 	
 	
 	r   ptrc                     | j                             |          }|j        d|_        t                              d|j        d         |j        |           |j        S )zh
        Internal method to look up the allocation data
        when memory is freed in the memory pool.Nz:Freed %s bytes for %s with address %s from cumem allocatorrQ   )rE   popr   rR   rS   r   r   )rL   rU   datas      r   rJ   z$CuMemAllocator._python_free_callback   s_     #'',,!-%)D"HKNH		
 	
 	
 {r   offload_tags.c                    |t           j        f}nt          |t                    r|f}t          |t                    sJ d}d}| j                                        D ]\  }}|j        }||d         z  }|j        |v rx||d         z  }|d         }t          j
        |t          j        dt                                }|                                }	t                              |	||           ||_        t#          |           t$                              d|dz  |dz  ||z
  dz             t)          j                     t          j                                         dS )aS  
        Put the allocator in sleep mode.
        All data in the memory allocation with the specified tag will be
        offloaded to CPU memory, and others will be discarded.

        :param offload_tags: The tags of the memory allocation that will be
            offloaded. The rest of the memory allocation will be discarded.
        Nr   rQ   cpu)dtypedevice
pin_memoryzCuMemAllocator: sleep freed %.2f GiB memory in total, of which %.2f GiB is backed up in CPU and the rest %.2f GiB is discarded directly.i   @)r8   r;   
isinstancer   tuplerE   itemsr   r   r   emptyuint8r   data_ptr	libcudart
cudaMemcpyr   r%   rR   infogccollectr+   empty_cache)
rL   rY   total_bytesbackup_bytesrU   rX   r   size_in_bytesr   cpu_ptrs
             r   sleepzCuMemAllocator.sleep   s}     +68LLc** 	+(?L,.....-3355 	& 	&IC[F6!9$Kx<''q	) &q	$)K!+ 688	% % %! ,4466$$Wc=AAA):&f%%%% '!7"<'72	
 	
 	
 	

     r   tagsc                 f   | j                                         D ]\  }}|	|j        |v r|j        }t	          |           |j        i|j        }|`|                                |                                z  }|                                }t          
                    |||           d|_        dS )a|  
        Wake up the allocator from sleep mode.
        All data that is previously offloaded will be loaded back to GPU
        memory, and the rest of the data will have empty memory.

        :param tags: The tags of the memory allocation that will be loaded
            back to GPU memory. If None, all memory allocation will be loaded
            back to GPU memory.
        N)rE   ra   r   r   r#   r   numelelement_sizerd   re   rf   )rL   rp   rU   rX   r   r   rm   rn   s           r   wake_upzCuMemAllocator.wake_up   s     -3355 	6 	6IC|tx4//v&&&)5(,(>%(4-33558I8V8V8X8XX & #4"<"<">">!,,S'=III15.	6 	6r   r   c              #     K   |t           j        }t          |t                    sJ | j        }|| _        t          | j        | j                  5 }|| j        |<   dV  |d         	                                }|D ]8}|d         dk    r*| 
                    |d                   }t          |           9|| _        ddd           dS # 1 swxY w Y   dS )a'  
        A context manager to use the memory pool.
        All memory allocation created inside the context will be allocated
        in the memory pool, and has the specified tag.

        :param tag: The tag of the memory allocation. If None, the default tag
            will be used.
        Nr   allocated_sizeaddress)r8   r;   r_   r   rF   r6   rI   rK   rG   snapshotrJ   r%   )rL   r   old_tagrX   allocations
allocationr   s          r   use_memory_poolzCuMemAllocator.use_memory_pool   s6      ; ,C#s#####"+')B
 
 	' -1D$S)EEE q'**,,K) . .
./144!77
98MNNF%f---&D3	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	's   A+CCCc                 l    d}| j                                         D ]\  }}|j        }||d         z  }|S )zM
        Get the total number of bytes allocated in the memory pool.
        r   rQ   )rE   ra   r   )rL   	sum_bytesrU   rX   r   s        r   get_current_usagez CuMemAllocator.get_current_usage%  sH     	-3355 	# 	#IC[F"IIr   )r   r8   r!   )r   r   r   __doc__r9   r   r;   r   staticmethodr>   rN   r   rH   intrJ   r`   ro   listrt   r   r|   r   r   r   r   r8   r8   W   sj         0 "&H%%% K   	' 	' 	' \	'? ? ?"           0! 0!%S/C"7$"> 0!$ 0! 0! 0! 0!d6 6DI, 6 6 6 6 60 )' )'3: )' )' )' ^)'V3      r   r8   ))dataclassesrh   rB   collections.abcr   
contextlibr   typingr   r   vllm.loggerr   vllm.utils.platform_utilsr   vllm.utils.system_utilsr   r   rR   r=   vllm.cumem_allocatorr	   r
   r   2vllm.distributed.device_communicators.cuda_wrapperr   r.   re   ModuleNotFoundErrorr`   r   r   	dataclassr   r#   r%   r+   r,   r-   r0   r6   r8   r   r   r   <module>r      s       				 				 $ $ $ $ $ $ % % % % % %        # # # # # # = = = = = = 7 7 7 7 7 7	X		          
 QPPPPP""#455HIOO   K #MHIII 3S#%&
 2 2 2 2 2 2 2 2.j .T . . . .1 1 1 1 1 1ucz*>FSzSWGW>X
Z-    "ucz*">FSzSWGW>X"	" " " "V V V V V V V V V Vs   'A+ +A?>A?