
    PiO                         d dl Z d dlmZ d dlmZ d dlZd dlZd dlZd dlmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZ  ed	          Z G d
 de
          Z G d de
          Zdej        dedeee j        f         fdZdS )    N)Union)warn)nn)saved_tensors_hooks)	NF4Tensor)
TiedLinear)
get_loggerDEBUGc                   B     e Zd ZdZ	 	 	 	 ddedededed	d
f
 fdZ xZS )OffloadActivationsa  Context manager under which activation tensors created in the forward pass will be offloaded.

    Enable the memory efficiency technique of activation offloading, where activations bigger than
    min_offload_size bytes will be offloaded to CPU in the forward and brought back in the backward.
    This is in contrast to maintaining the activation on GPU VRAM throughout the program.

    This manager contains the option of using one additional CUDA stream to handle the communication
    between CUDA and CPU, which is intended to overlap with the default computation stream to improve
    runtime. We designed synchronization with a few heuristics for optimizing the tradeoff between
    runtime vs memory usage.

    Args:
        use_pin_memory (bool): Whether or not the offloaded Tensor will be placed in pinned
            memory on the CPU. Pinned memory allows the Tensor to be moved back onto GPU more quickly
            but is a limited resource. Default: True.

        use_streams (bool): Whether or not to use streams for performance optimization where
            the communications get overlapped with the computation. Requires a torch build
            after torch-2.5.0.]. Default: True.

        max_fwd_stash_size (int): The maximum size of the forward stash, or the maximum number of
            consecutive activations to keep alive during the forward pass. This number must be at
            least 1. Keeping alive more activations will potentially allow more overlap between the
            communication and compute streams at the cost of increasing memory usage. Keeping alive
            fewer activations will conserve memory, but may cause poor overlap between the streams,
            increasing runtime. Default: 5.

        min_offload_size (int): The minimum number of bytes a Tensor must be in order to qualify
            for offloading. If the tensor is too small, we do not want to waste bandwidth and resources
            moving it to CPU and back. Default: 1024 bytes.

    Raises:
        ValueError: if max_fwd_stash_size is not at least 1.

    Example:
        >>> with OffloadActivations():
        >>>     logits = model(inputs)
        >>> loss = ...
        >>> loss.backward()
    T      use_pin_memoryuse_streamsmax_fwd_stash_sizemin_offload_sizereturnNc                    	
 | _         | _        i  _        d _        d _        d _        d _        | _        d _        t          j
                                         _         j         ret          j
                                         _        i  _        |dk     rt!          d|           | _        i  _        i  _        d  _        d  _        	 fddt,          fd	dt.          f fd	d
t          j        dt.          fd
dt          j        dt.          f
 fd}dt.          dt          j        f fd}dt.          dt          j        f fd} j         r|n|}t3                                          ||           d S )Nr   T<      z/max_fwd_stash_size should be at least 1 but is c                  l                 } | j         k    rt          d| dj         d           d S d S )Nz***** WARNING: curr_pct=z!% > self.virtual_memory_safe_pct=z% of virtual memory used)virtual_memory_safe_pctr   )curr_pctget_cpu_ram_pctselfs    }/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torchtune/training/_activation_offloading.py verify_sufficient_virtual_memoryzEOffloadActivations.__init__.<locals>.verify_sufficient_virtual_memoryn   s[    &((H$666lhllT5Qlll     76    r   c                  2    t          j                    j        S N)psutilvirtual_memorypercent r   r   r   z4OffloadActivations.__init__.<locals>.get_cpu_ram_pctu   s    (**22r   c                  2     xj         dz  c_          j         S )Nr   )	tensor_id)r   s   r   get_tensor_idz2OffloadActivations.__init__.<locals>.get_tensor_idy   s    NNaNN>!r   xc                 T    |                                  |                                 z  S r    )element_sizenelement)r(   s    r   get_num_bytes_tensorz9OffloadActivations.__init__.<locals>.get_num_bytes_tensor~   s#       1::<</r   
activationc                 p   j         r0t          j                  dk    s
J d            d_         d_         
|           }             }|j        k    rt          | t          j        j                  st          | t          j        j	                  sj
        rd j                                        D             D ]C}||j        z
  k    r3j        |         \  }}j                            |           j        |= C j                            j                   j
        rj        nj        }t          j                            |          5  	 t          j        | j        d          }nI# t,          $ r<}t          | t.                    r t0          j        dk     rt5          d	          ||d }~ww xY w|                    | d
           |dfj        |<   d d d            n# 1 swxY w Y   j
        r%j                                        }	| |	fj        |<   n| dfj        |<   |S )Nr   z8backward pass should have cleared tracker of all tensorsFTc                     g | ]}|S r$   r$   .0ks     r   
<listcomp>zDOffloadActivations.__init__.<locals>.pack_tensor.<locals>.<listcomp>       @@@Qq@@@r   cpu)
pin_memorydevicez0.6.0.dev20240917zAOffloading NF4Tensors requires torchao-0.6.0.dev20240917 or laternon_blocking)is_first_forward_calllentrackeris_first_backward_callmin_tensor_size_bytes
isinstancetorchr   	ParameterBufferr   	fwd_stashkeysr   s0
wait_events1wait_streamcudastream
empty_liker   NotImplementedErrorr   torchao__version__RuntimeErrorcopy_record_event)r-   	num_bytesr&   id_evrJ   
cpu_tensoreeventr,   r'   r   s             r   pack_tensorz0OffloadActivations.__init__.<locals>.pack_tensor   s   ) 3%%***M +** .3*.2+ -,Z88I%I D666z58+=>> 7":ux?? 7 # 1 A@$.*=*=*?*?@@@ " "T-D!DDD$(N2$6EAr G..r222 $r 2 2! G''000$($4A$'Z&&v..   %*%5&43Fu& & &

 /      &z9==% ' 36I I I". c# ##$%    $$Zd$CCC"/DL+              ( # D G0022E 2<U0CDN9- +Y'
 s6   G2E:9G2:
G 7F;;G  &G22G69G6unpack_tensor_idc                    j         r-j        rd_        j        r
              d_         d_        | j        v sJ d|              j        |          \  }}|r|                    dd          }|}j        | = |S )NFTuntracked tensor with id rI   r8   )r=   is_first_forward_passr   r:   r<   to)rZ   maybe_gpu_tensormodified
gpu_tensorr   r   s       r   unpack_tensor_single_streamz@OffloadActivations.__init__.<locals>.unpack_tensor_single_stream   s     * 2- ;16D.* ;88:::.3+-1* !DL000=+;== 100 *.6F)G&h .-00d0KK
#-  -.##r   c                 j   	
 j         rt          j                                        _        dfd}t          j        j        j        j        	                    |           j
        rd_
        j        r
              d_         d_         j        v sJ d              j                  \  }}|rqt          j                                        }t          j                                        }g 	|j        k    r5j        |k    r*|_        d j                                        D             	d j        v rj                  d         }dnt          j                            j                  5  |                    dd	          }|}d d d            n# 1 swxY w Y   j                            j                   |j         <   t          j                            |                                j                  
	
 fd
}|                    |           j         = |S )Nr   c                      d j                                         D             D ]1} j        |          }j                            |           j         | = 2d S )Nc                     g | ]}|S r$   r$   r0   s     r   r3   z~OffloadActivations.__init__.<locals>.unpack_tensor_with_streams.<locals>.wait_and_del_remaining_references.<locals>.<listcomp>   s    GGGQqGGGr   )bwd_tensor_stashrD   bwd_ev_stashrG   rF   )rS   rX   r   s     r   !wait_and_del_remaining_referenceszjOffloadActivations.__init__.<locals>.unpack_tensor_with_streams.<locals>.wait_and_del_remaining_references   sj    GG$*?*D*D*F*FGGG 6 6 $ 1" 5**5111 1"556 6r   FTr\   c                     g | ]}|S r$   r$   )r1   rS   s     r   r3   zSOffloadActivations.__init__.<locals>.unpack_tensor_with_streams.<locals>.<listcomp>
  s    $O$O$OBR$O$O$Or   r   rI   r8   c                 >   r	j                  }t          j                            |                                j                  
k    r#|                    	j                   	j         = n#	j                                        }|	j	        <   d 	j
                                        D             D ]4}	j
        |         \  }}	j                            |           	j
        |= 5D ]1}	j	        |         }	j                            |           	j         |= 2| S )Nc                     g | ]}|S r$   r$   r0   s     r   r3   zaOffloadActivations.__init__.<locals>.unpack_tensor_with_streams.<locals>.hook.<locals>.<listcomp>I  r4   r   )rf   r@   _C_storage_Use_Countuntyped_storage_cdatarecord_streamrE   rQ   rg   rC   rD   rF   rG   )outputsinputsunpacked_tensorrX   rS   rT   rU   brought_back_from_cpuprev_node_idsr   storage_refcountrZ   s          r   hookzMOffloadActivations.__init__.<locals>.unpack_tensor_with_streams.<locals>.hook-  sB   , H +/*?@P*Q!H77 / ? ? A A H  // /
 ,99$'BBB $ 56F G G$(G$8$8$:$:EBGD-.>? A@$.*=*=*?*?@@@ / / $r 22**2... N2.. , 6 6 $ 1" 5**5111 1"55"Nr   r   N)r=   r@   rl   _current_graph_task_idcurr_graph_idautogradvariableVariable_execution_enginequeue_callbackr]   r   r:   r<   _current_autograd_nodecurr_autograd_noderf   rD   rC   rI   rJ   rG   r^   rE   rH   rm   rn   ro   register_hook)rZ   rh   r_   r`   graph_idnodera   rw   rt   ru   rv   r   r   s   `       @@@r   unpack_tensor_with_streamsz?OffloadActivations.__init__.<locals>.unpack_tensor_with_streams   s    * 2%*X%D%D%F%F"6 6 6 6 6 6 '0BQQ5   - ;16D.* ;88:::.3+-1* !DL000=+;== 100 *.6F)G&h U) 8::<<x6688 " t111d6MQU6U6U.2D+$O$O$2G2L2L2N2N$O$O$OM(,%#t~55'+~6F'G'J$,1)) **4733 6 6%5%8%8d%8%S%S
+5(6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
 G''000 ?OD)*:; (-x'B'B(88::A( ($'# '# '# '# '# '# '# '# '#R ""4((( -.##s   =F##F'*F')r   r>   r<   r&   r:   r=   r]   r   r   r@   rI   default_streamrE   StreamrG   rC   
ValueErrorr   rf   rg   rz   r   floatintTensorsuper__init__)r   r   r   r   r   rY   rb   r   unpack_tensorr   r,   r'   r   	__class__s   `        @@@@r   r   zOffloadActivations.__init__B   sf    "-  	"  	  %)"&*#%)" %3 	$ *++--  	+j''))DGDN!A%% ZFXZZ   '9D#$&D! "D!%D&*D#	 	 	 	 	 		3 	3 	3 	3 	3	"s 	" 	" 	" 	" 	" 	"
	EL 	S 	 	 	 	C	EL C	S C	 C	 C	 C	 C	 C	 C	 C	J	$# 	$%, 	$ 	$ 	$ 	$ 	$ 	$ 	$2w	$ w	$ w	$ w	$ w	$ w	$ w	$ w	$ w	$v -&&, 	
 	m44444r   )TTr   r   )__name__
__module____qualname____doc__boolr   r   __classcell__r   s   @r   r   r      s        ' 'V  $ "# $_5 _5_5 _5  	_5
 _5 
_5 _5 _5 _5 _5 _5 _5 _5 _5 _5r   r   c                   $     e Zd ZdZd fdZ xZS )NoOpManagerae  
    A saved_tensors_hook manager used to disable any other saved_tensors_hook manager
    applied before. This relies on the behavior that only the most recently registered
    saved_tensors_hook will run.

    One example usage is to opt a local region of code out of activations offloading,
    which is usually applied globally to best track state.
    r   Nc                 R    d }t                                          ||           d S )Nc                     | S r    r$   )tensors    r   noopz"NoOpManager.__init__.<locals>.noopo  s    Mr   )r   r   )r   r   r   s     r   r   zNoOpManager.__init__n  s4    	 	 	 	t$$$$$r   rx   )r   r   r   r   r   r   r   s   @r   r   r   d  sG         % % % % % % % % % %r   r   modelenable_activation_offloadingr   c                    |r+t                      }d}t                      t          | d          rt          | j        t
          j                  r?| j                            fd           | j                            fdd           d}nt          | j        t                    rH| j        j
                            fd           | j        j
                            fdd           d}nt          | d	          rt          d
          |st                              d           nt          j                    }|S )a  Returns the activation offloading context manager for the model, which will be
    a null context if enable_activation_offloading is False.

    If activation offloading is enabled, we return the OffloadActivations context manager.
    If activation offloading is disabled, we return a NoOpManager context manager.

    Args:
        model (nn.Module): the model to wrap with the activation offloading context manager.
        enable_activation_offloading (bool): whether or not to enable activation offloading
            for the model.

    Returns:
        contextlib.ContextDecorator: the activation offloading context manager for the model.

    Raises:
        NotImplementedError: If the model is a multimodal model and activation offloading is enabled.
    Foutputc                  ,                                     S r    	__enter__argsnoop_ctxs    r   <lambda>z0get_act_offloading_ctx_manager.<locals>.<lambda>      ("4"4"6"6 r   c                  ,                                     S r    __exit__r   s    r   r   z0get_act_offloading_ctx_manager.<locals>.<lambda>      ("3"3"5"5 r   T)always_callc                  ,                                     S r    r   r   s    r   r   z0get_act_offloading_ctx_manager.<locals>.<lambda>  r   r   c                  ,                                     S r    r   r   s    r   r   z0get_act_offloading_ctx_manager.<locals>.<lambda>  r   r   decoderzjMultimodal model does not support activation offloading yet. Please set enable_activation_offloading=Falsea  During activation offloading, no output head was detected. If your model has an output head, it will be offloaded. This usually greatly slows training, given the large vocabulary size. To change this behavior, set your output head as model.output and make it an nn.Module.)r   r   hasattrr?   r   r   Moduleregister_forward_pre_hookregister_forward_hookr   linearrL   logwarning
contextlibnullcontext)r   r   activations_handling_ctxoutput_head_detectedr   s       @r   get_act_offloading_ctx_managerr   u  s   ( $ 6<#5#7#7   %==5(## 	%,	22 ,666666   2255554 3    (,$$EL*55 ,#==6666   #9955554 :    (,$UI&& 	
 &|   $ 	KK    $.#9#;#; ##r   )r   typingr   warningsr   r!   r@   rM   r   torch.autograd.graphr   torchao.dtypes.nf4tensorr   torchtune.modulesr   torchtune.utilsr	   r   r   r   r   r   r   r   r$   r   r   <module>r      si                            4 4 4 4 4 4 . . . . . . ( ( ( ( ( ( & & & & & &jI5 I5 I5 I5 I5, I5 I5 I5X
% % % % %% % % %"L$9L$48L$
z556L$ L$ L$ L$ L$ L$r   