
    Pi;                     R    d dl mZ d dlZd dlmZmZ d dlmZ  G d de          ZdS )    )TypeN)	OptimizerParamsT)get_available_devicesc                       e Zd Zej        j        fddddedee         de	de
dd	f
d
Z ej                    dd            ZddZed             Zd Zd Zd	S )CPUOffloadOptimizerFi   )offload_gradientsminimal_sizeparamsoptimizer_classr	   r
   returnNc                     |t           j        j        u rd|vr|                    d           t	          |          }t          |          dk    rt          d          t          |d         t                    sd|ig}| _	        d _
        g  _        t                       _        t                       _        t                      d          _         j        d	v s
J d
            t!          t            j                                                   _        t                       _         fd}|D ]}|                    d          }g }	|D ]}
|
j        s
|
                                 j	        k     r|	                    |
           =t          j        |
dd          }t          j        |d          |_        |                    |
                                d           | j        |
<   |
                    |            |d|i|gfi | j        |
<   t          |	          dk    r j                            d|	i|           t           j                  dk    r | j        fi | _
        dS dS )a  Offload optimizer to CPU for single-GPU training. This will reduce GPU memory by the size of optimizer state.
        Optimizer step will be done on CPU.

        Args
            params: a list of parameters or parameter groups.
            optimizer_class: constructor of the base optimizer. Defaults to :class:`torch.optim.AdamW`.
            offload_gradients: free GPU gradients once they are moved to CPU. Not compatible with gradient accumulation.
            minimal_size: tensors smaller than this are kept on the GPU, to avoid excessively many small transfers.
            kwargs: other keyword arguments to be passed to the base optimizer e.g. `lr`, `weight_decay`.
        fusedT)r   r   z%optimizer got an empty parameter listr   N)cudaxpuz.CPU Offload currently only supports CUDA & XPUc                 L   | j         j        |          }j                            t	          t
          j                                                             t	          t
          j                                      j                  5  |j                             | j         d           d d d            n# 1 swxY w Y   | j	        v rj	        | = j        
                                j	        | <   r*| j                             j                   d | _         d S d S d S NTnon_blocking)gradparam_d2h_mapstreamwait_streamgetattrtorchdevicecurrent_streamcopy_queuerecord_eventrecord_stream)p_devicep_hostr	   selfs     m/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torchao/optim/cpu_offload.pybackward_hookz3CPUOffloadOptimizer.__init__.<locals>.backward_hookA   s_   }(+H5 ''t{(C(C(R(R(T(TUUUUDK0077DD H HK%%hm$%GGGH H H H H H H H H H H H H H H tz))
8,'+{'?'?'A'A
8$ % )M//<<<$(HMMM% )( ) )s   "B;;B?B?cpu)r   
pin_memory)r)   r   )r   optimAdamWupdatelistlen
ValueError
isinstancedictr
   d_optd_param_groupsr   
optim_dictr   r   r   Streamr   r    poprequires_gradnumelappend
empty_liker   r   detach"register_post_accumulate_grad_hook)r%   r   r   r	   r
   kwargsparam_groupsr'   param_groupretained_paramsr#   r$   s   `  `        r&   __init__zCPUOffloadOptimizer.__init__   s   ( ek///G64I4IMMM%%%F|||!!DEEE,q/400 	6%|45L )
 !VV&&+--b1{ 
 
 
 
 <
 
 
 eT[1188:: VV
	) 	) 	) 	) 	) 	)* ( 	W 	WK __X..F O"  - >>##d&777#**8444 )(5TRRR#.v$GGGX__..TBBB/5"8,;;MJJJ,;O6+67- -;A- -)) ?##a''#**Ho+U+UVVVt"##a''()<GGGGDJJJ ('    c                 F   d }|
 |            }| j         | j                                          | j                                        D ]\  }}|                                 | j        |                                          | j        |         }t          t          | j	                  
                    | j
                  5  |                    |d           d d d            n# 1 swxY w Y   | j
                                         | j                                         |S r   )r2   stepr    itemssynchronizer4   r   r   r   r   r   r   clear)r%   closurelossr#   grad_d2h_eventr$   s         r&   rD   zCPUOffloadOptimizer.stept   sM   799D :!JOO(,
(8(8(:(: 		: 		:$Hn&&(((OH%**,,,
 '1F,,33DK@@ : :vD999: : : : : : : : : : : : : : : 	!!!
s   ?C##C'	*C'	Tc                     |sJ | j                                         D ]	}d |_        
| j        | j                            |           d S d S )N)set_to_none)r   keysr   r2   	zero_grad)r%   rL   r#   s      r&   rN   zCPUOffloadOptimizer.zero_grad   se    { *//11 	! 	!H HMM:!J  [ 99999 "!rB   c                 p    t          d | j                                        D             | j                  S )Nc              3   $   K   | ]}|j         V  d S N)r>   .0r*   s     r&   	<genexpr>z3CPUOffloadOptimizer.param_groups.<locals>.<genexpr>   s%      FFEUFFFFFFrB   )start)sumr4   valuesr3   )r%   s    r&   r>   z CPUOffloadOptimizer.param_groups   s@     FFT_-C-C-E-EFFF%
 
 
 	
rB   c                     dd | j                                         D             i}| j        r| j                                        |d<   |S )N	offloadedc                 6    g | ]}|                                 S  )
state_dictrR   s     r&   
<listcomp>z2CPUOffloadOptimizer.state_dict.<locals>.<listcomp>   s$    SSS%**,,SSSrB   	on-device)r4   rW   r2   r\   )r%   r\   s     r&   r\   zCPUOffloadOptimizer.state_dict   sU    SS$/:P:P:R:RSSS

 : 	>&*j&;&;&=&=J{#rB   c                    t          | j                                        |d                   D ]\  }}|                    |           | j        r"| j                            |d                    d S d|v rt          d          d S )NrY   r^   zPloaded state dict has a 'on-device' parameter group not present in the optimizer)zipr4   rW   load_state_dictr2   r/   )r%   r\   r*   optim_state_dicts       r&   ra   z#CPUOffloadOptimizer.load_state_dict   s    '*O""$$j&=(
 (
 	4 	4#E# !!"23333: 	J&&z+'>?????J&&b   '&rB   rQ   )T)__name__
__module____qualname__r   r*   r+   r   r   r   boolintrA   no_gradrD   rN   propertyr>   r\   ra   r[   rB   r&   r   r      s        ,1;+<_H
 #( _H _H _H_H i_H
  _H _H 
_H _H _H _HB U]__   _2: : : : 
 
 X
      rB   r   )	typingr   r   torch.optim.optimizerr   r   torchao.utilsr   r   r[   rB   r&   <module>rm      s           4 4 4 4 4 4 4 4 / / / / / /b b b b b) b b b b brB   