§
    ÇPƒi%C  ã                   ó|  — U d dl Z d dlZd dlZd dlmZ d dlmZ d dlmZm	Z	m
Z
mZ d dlmZ d dlZd dlmZ d dlmZmZ d dlmZ dZeed	<   ej        d
dœdej        de	eef         dedej        dedefd„Zdededee         fd„Z ej        d
dœdej        de	eef         dedej        dedefd„Z!ej        d
fdej        dej        defd„Z"e j#        dej        de
d         fd„¦   «         Z$e j#        dddœdej        dedej%        dej        dee         d ee         de
d         fd!„¦   «         Z&dej        fd"„Z'dS )#é    N)ÚOrderedDict)Úpartial)ÚAnyÚDictÚ	GeneratorÚOptional)Úwarn)ÚFakeTensorConverterÚFakeTensorMode)Ú	NF4TensorFÚ_use_low_cpu_ramT©ÚdtypeÚoffload_to_cpuÚmodelÚ
state_dictÚargsr   r   Úkwargsc                óÒ   — |                      ¦   «         D ]Q\  }}t          |t          ¦  «        r7|                     |¦  «        ||<   |r||                              ¦   «         ||<   ŒRdS )aì  
    A state_dict hook that replaces NF4 tensors with their restored
    higher-precision weight and optionally offloads the restored weight to CPU.
    Use this hook to avoid increased peak GPU memory usage during checkpoint
    save when training with QLoRA.

    This function is meant to be used with PyTorch's ``nn.Module._register_state_dict_hook``, i.e.

    >>> m = MyModule()
    >>> m._register_state_dict_hook(reparametrize_as_dtype_state_dict_post_hook)

    If the hook is registered per the above process, this hook will be called _after_ the module's
    ``state_dict`` method is called. The hook will replace all ``NF4Tensor`` instances by unquantizing
    them to the original dtype, and optionally offload the restored weight to CPU.

    Args:
        model (nn.Module): the model to take ``state_dict()`` on
        state_dict (Dict[str, Any]): the state dict to modify
        *args (Any): Unused args passed when running this as a state_dict hook.
        dtype (torch.dtype): the dtype to restore the weight to. Default is ``torch.bfloat16``.
        offload_to_cpu (bool): whether to offload the restored weight to CPU. Default is ``True``.
        **kwargs (Any): Unused keyword args passed when running this as a state_dict hook.
    N)ÚitemsÚ
isinstancer   ÚtoÚcpu)r   r   r   r   r   r   ÚkÚvs           úr/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torchtune/modules/common_utils.pyÚ+reparametrize_as_dtype_state_dict_post_hookr      st   € ð> × Ò Ñ"Ô"ð 4ð 4‰ˆˆ1ÝaÑ#Ô#ð 	4ØŸDšD ™KœKˆJq‰MØð 4Ø *¨1¤× 1Ò 1Ñ 3Ô 3
˜1‘øð	4ð 4ó    Ú	slice_strÚlengthÚreturnc                 ó~  ‡— d| vsd| vs
J d¦   «         ‚d| v rVd„ |                       d¦  «        D ¦   «         }t          ˆfd„|D ¦   «         ¦  «        s
J d¦   «         ‚dg‰z  }|D ]}d||<   Œ|S |                       d¦  «        }t          |¦  «        d	k    s
J d
¦   «         ‚d\  }}}t          |¦  «        dk    r)|d         dk    rt          |d         ¦  «        }|dz   }d}nÖt          |¦  «        dk    rG|d         dk    rt          |d         ¦  «        nd}|d         dk    rt          |d         ¦  «        nd}n|t          |¦  «        d	k    ri|d         dk    rt          |d         ¦  «        nd}|d         dk    rt          |d         ¦  «        nd}|d         dk    rt          |d         ¦  «        nd}|d|cxk    r‰k     sn J d¦   «         ‚|d|cxk    r‰k     sn J d¦   «         ‚||dk    s
J d¦   «         ‚dg‰z  }t	          ||nd||n‰||nd¦  «        }	|	D ]}d|cxk    r‰k     rn Œd||<   Œ|S )af  
    Convert a string representing a Python slice or index into a boolean array.

    The resulting array will have the same length as the specified `length` parameter.
    Each element in the array corresponds to an index in the original sequence,
    with `True` indicating that the index is included in the slice and `False` otherwise.

    Args:
        slice_str (str): A string representing a Python slice or index, e.g. "1:3", ":5", "2::3", "0,4,5".
        length (int): The length of the original sequence.

    Returns:
        list[bool]: A boolean array representing the slice.

    Examples:
        >>> slice_str_to_array("1:3", 5)
        [False, True, True, False, False]
        >>> slice_str_to_array(":", 5)
        [True, True, True, True, True]
        >>> slice_str_to_array("::2", 5)
        [True, False, True, False, True]
        >>> slice_str_to_array("1::2", 5)
        [False, True, False, True, False]
        >>> slice_str_to_array("2:5:2", 6)
        [False, False, True, False, True, False]
        >>> slice_str_to_array("0,4,5", 7)
        [True, False, False, False, True, True, False]
    ú,ú:zCannot mix commas and colonsc                 ó,   — g | ]}t          |¦  «        ‘ŒS © )Úint)Ú.0Úis     r   ú
<listcomp>z&slice_str_to_array.<locals>.<listcomp>_   s   € Ð8Ð8Ð8˜a•3q‘6”6Ð8Ð8Ð8r   c              3   ó8   •K  — | ]}d |cxk    o‰k     nc V — ŒdS )r   Nr&   )r(   r)   r    s     €r   ú	<genexpr>z%slice_str_to_array.<locals>.<genexpr>`   s7   øè è € Ð4Ð4 q1˜??’??˜F’????Ð4Ð4Ð4Ð4Ð4Ð4r   zIndex out of rangeFTé   zInvalid slice format©NNNé   r   Ú é   NzStart index out of rangezEnd index out of rangezStep cannot be zero)ÚsplitÚallÚlenr'   Úrange)
r   r    ÚindicesÚresultr)   ÚpartsÚstartÚendÚstepÚslice_indicess
    `        r   Úslice_str_to_arrayr=   >   sÐ  ø€ ð< iÐÐ 3¨iÐ#7Ð#7Ð#7Ð9WÑ#7Ô#7Ð7à
ˆiÐÐØ8Ð8 9§?¢?°3Ñ#7Ô#7Ð8Ñ8Ô8ˆÝÐ4Ð4Ð4Ð4¨GÐ4Ñ4Ô4Ñ4Ô4ÐJÐJÐ6JÑJÔJÐ4Ø˜6Ñ!ˆØð 	ð 	ˆAØˆF1‰IˆIØˆàOŠO˜CÑ Ô €EÝˆu‰:Œ:˜Š?ˆ?ˆ?Ð2‰?Œ?ˆ?Ø'Ñ€Eˆ3å
ˆ5z„zQ‚€˜5 œ8 rš>˜>ÝE˜!”H‘”ˆØa‰iˆØˆˆÝ	ˆU‰ŒqŠˆØ!& q¤¨R¢ •E˜!”H‘”°TˆØ$ Qœx¨2š~˜~c%˜”(‰mŒmˆm°4ˆˆÝ	ˆU‰ŒqŠˆØ!& q¤¨R¢ •E˜!”H‘”°TˆØ$ Qœx¨2š~˜~c%˜”(‰mŒmˆm°4ˆØ % a¤¨B¢ s5˜”8‰}Œ}ˆ}°Dˆàˆ=˜A Ð/Ð/Ò/Ð/¨Ò/Ð/Ð/Ð/Ð/Ð1KÑ/Ô/Ð/Øˆ;˜!˜sÐ+Ð+Ò+Ð+ VÒ+Ð+Ð+Ð+Ð+Ð-EÑ+Ô+Ð+Øˆ<˜4 1š9˜9˜9Ð&;™9œ9Ð$àˆWvÑ€FÝØÐ"ˆˆ¨Øˆˆˆ FØÐ ˆˆ añô €Mð ð ð ˆØˆ?ˆ?Š?ˆ?FŠ?ˆ?ˆ?ˆ?ˆ?ØˆF1‰Iøà€Mr   c                ó$  — t          ¦   «         }t          ¦   «         }t          ¦   «         }|                     ¦   «         D ]\  }	}
t	          |
t
          ¦  «        r-|                     ||
¦  «                             |¦  «        ||	<   n|                     ||
¦  «        ||	<   |r||	                              ¦   «         ||	<   Œ€d}t          j
                             d¬¦  «        5  t          j        ||¦  «         ddd¦  «         n# 1 swxY w Y   t          j
                             t          j        ¦  «        5  t          j        |dd¬¦  «        }ddd¦  «         n# 1 swxY w Y   |                     ¦   «         D ]s}	t	          ||	         t
          ¦  «        r5||	                              ||	                              |¦  «        ¦  «         ŒR||	                              ||	         ¦  «         Œt|                     ¦   «         D ]}	||	         ||	<   ŒdS )ax  
    A state_dict hook that replaces NF4 tensors with their restored
    higher-precision weight and optionally offloads the restored weight to CPU.
    Use this hook to avoid increased peak GPU memory usage during checkpoint
    save when training with QLoRA.

    This hook is similar to ``reparametrize_as_dtype_state_dict_post_hook`` but uses
    FakeTensor and mmap(2) to avoid CPU OOM on colab.

    This function is meant to be used with PyTorch's ``nn.Module._register_state_dict_hook``, i.e.

    >>> m = MyModule()
    >>> m._register_state_dict_hook(reparametrize_as_dtype_state_dict_post_hook)

    If the hook is registered per the above process, this hook will be called _after_ the module's
    ``state_dict`` method is called. The hook will replace all ``NF4Tensor`` instances by unquantizing
    them to the original dtype, and optionally offload the restored weight to CPU.

    Args:
        model (nn.Module): the model to take ``state_dict()`` on
        state_dict (Dict[str, Any]): the state dict to modify
        *args (Any): Unused args passed when running this as a state_dict hook.
        dtype (torch.dtype): the dtype to restore the weight to. Default is ``torch.bfloat16``.
        offload_to_cpu (bool): whether to offload the restored weight to CPU. Default is ``True``.
        **kwargs (Any): Unused keyword args passed when running this as a state_dict hook.
    z/tmp/fake_state_dict.ptT)Úmaterialize_fake_tensorsN)ÚmmapÚweights_only)r   r
   r   r   r   r   Úfrom_real_tensorr   r   ÚtorchÚserializationÚ	skip_dataÚsaveÚset_default_mmap_optionsr@   Ú
MAP_SHAREDÚloadÚkeysÚcopy_)r   r   r   r   r   r   ÚmodeÚ	converterÚfake_state_dictr   r   Údest_state_dict_pathÚdest_state_dicts                r   Ú4_low_ram_reparametrize_as_dtype_state_dict_post_hookrQ   ˆ   s”  € õF ÑÔ€DÝ#Ñ%Ô%€IÝ!‘m”m€OØ× Ò Ñ"Ô"ð :ð :‰ˆˆ1ÝaÑ#Ô#ð 	EØ!*×!;Ò!;¸DÀ!Ñ!DÔ!D×!GÒ!GÈÑ!NÔ!NˆO˜AÑÐà!*×!;Ò!;¸DÀ!Ñ!DÔ!DˆO˜AÑàð 	:Ø!0°Ô!3×!7Ò!7Ñ!9Ô!9ˆO˜AÑøð 5ÐÝ	Ô	×	&Ò	&ÀÐ	&Ñ	EÔ	Eð :ð :ÝŒ
?Ð$8Ñ9Ô9Ð9ð:ð :ð :ñ :ô :ð :ð :ð :ð :ð :ð :øøøð :ð :ð :ð :å	Ô	×	5Ò	5µd´oÑ	FÔ	Fð Yð YÝœ*Ð%9ÀÐSWÐXÑXÔXˆðYð Yð Yñ Yô Yð Yð Yð Yð Yð Yð Yøøøð Yð Yð Yð Yð
 _Š_ÑÔð 4ð 4ˆÝj ”m¥YÑ/Ô/ð 	4Ø˜AÔ×$Ò$ Z°¤]×%5Ò%5°eÑ%<Ô%<Ñ=Ô=Ð=Ð=à˜AÔ×$Ò$ Z°¤]Ñ3Ô3Ð3Ð3ð
 _Š_ÑÔð +ð +ˆØ'¨Ô*ˆ
1‰ˆð+ð +s$   Ã!DÄDÄ
DÄ7EÅEÅ"EÚmodulec                 óº   — t           r't          j        dk    rt          d¦  «        ‚t          }nt
          }|                      t          |||¬¦  «        ¦  «         dS )a}  
    Register the reparametrize state dict hooks to the module and its submodules.

    This function is a wrapper that is meant to toggle between the low_cpu_ram
    and regular versions of the ``reparametrize_as_dtype`` state dict hooks.

    Args:
        module (nn.Module): the module to register the hooks to.
        dtype (torch.dtype): the dtype to restore the weight to. Default is ``torch.bfloat16``.
        offload_to_cpu (bool): whether to offload the restored weight to CPU. Default is ``True``.

    Raises:
        RuntimeError: If the low RAM reparametrize hook is used on Windows or an incompatible torch version.
    Úwin32zPLow RAM reparametrize_as_dtype_state_dict_post_hook is not supported on Windows.r   N)r   ÚsysÚplatformÚRuntimeErrorrQ   r   Ú_register_state_dict_hookr   )rR   r   r   Úhooks       r   Ú(_register_reparametrize_state_dict_hooksrZ   Î   sq   € õ& ð 	;ÝŒ<˜7Ò"Ð"åØbñô ð õ HˆDˆDå:ˆØ
×$Ò$Ý˜E°.ÐAÑAÔAñô ð ð ð r   r.   c              #   ó6  K  — |                       ¦   «         st          d¦  «        ‚|                      ¦   «         st          d¦  «         |                      ¦   «         D ]-}t          |d¦  «        rt          |j        ¦  «        rd|_        Œ.	 dV — |                      ¦   «         D ]-}t          |d¦  «        rt          |j        ¦  «        rd|_        Œ.dS # |                      ¦   «         D ]-}t          |d¦  «        rt          |j        ¦  «        rd|_        Œ.w xY w)aí  
    This context manager temporarily disables KV-cacheing on a given model, which must already
    already have KV-caches setup. All forward passes using the model within this context manager
    will not use KV-caches.

    KV-caches will be disabled when entering the context manager, and will be enabled upon exit,
    without being modified.

    This is useful in cases where we might wish to alternate between model calls which use KV-cacheing,
    and model calls which do not use KV-cacheing, without the additional overhead of deleting and setting caches up
    every time.

    Example:
        >>> from torchtune.models.llama3_2 import llama3_2_1b
        >>> from torchtune.modules import disable_kv_cache
        >>> import torch
        >>> model = llama3_2_1b()
        >>> # setup caches
        >>> model.setup_caches(batch_size=1,
        >>>                     dtype=torch.float32,
        >>>                     decoder_max_seq_len=1024)
        >>> print(model.caches_are_setup())
        True
        >>> print(model.caches_are_enabled())
        True
        >>> print(model.layers[0].attn.kv_cache)
        KVCache()
        >>> # now temporarily disable caches
        >>> with disable_kv_cache(model):
        >>>     print(model.caches_are_setup())
        True
        >>>     print(model.caches_are_enabled())
        False
        >>>     print(model.layers[0].attn.kv_cache)
        KVCache()
        >>> # caches are now re-enabled, and their state is untouched
        >>> print(model.caches_are_setup())
        True
        >>> print(model.caches_are_enabled())
        True
        >>> print(model.layers[0].attn.kv_cache)
        KVCache()

    Args:
        model (nn.Module): model to disable KV-cacheing for.

    Yields:
        None: Returns control to the caller with KV-caches disabled on the given model.

    Raises:
        ValueError: If the model does not have caches setup. Use :func:`~torchtune.modules.TransformerDecoder.setup_caches` to
            setup caches first.
    zrModel caches must be setup before calling disable_kv_cache! Please use model.setup_caches() to setup model caches.zˆYou are using disable_kv_cache with a model that does not have caches enabled. This is a no-op and the expected behaviour may not occur.Úkv_cacheFNT)	Úcaches_are_setupÚ
ValueErrorÚcaches_are_enabledr	   ÚmodulesÚhasattrÚcallabler\   Úcache_enabled©r   rR   s     r   Údisable_kv_cachere   ð   sM  è è € ðn ×!Ò!Ñ#Ô#ð 
ÝðEñ
ô 
ð 	
ð ×#Ò#Ñ%Ô%ð 
Ýðñ	
ô 	
ð 	
ð
 —-’-‘/”/ð )ð )ˆÝ6˜:Ñ&Ô&ð 	)­8°F´OÑ+DÔ+Dð 	)Ø#(ˆFÔ øð,Øˆˆˆà—m’m‘o”oð 	,ð 	,ˆFÝv˜zÑ*Ô*ð ,­x¸¼Ñ/HÔ/Hð ,Ø'+Ô$øð	,ð 	,øe—m’m‘o”oð 	,ð 	,ˆFÝv˜zÑ*Ô*ð ,­x¸¼Ñ/HÔ/Hð ,Ø'+Ô$øð	,øøøs   ÂC ÃAD©Úencoder_max_seq_lenÚdecoder_max_seq_lenÚ
batch_sizeÚdevicerg   rh   c             #   ó  K  — |                       ¦   «         rt          d¦  «        ‚|5  |                      ||||¬¦  «         ddd¦  «         n# 1 swxY w Y   	 dV — t          | ¦  «         dS # t          | ¦  «         w xY w)a›  
    This context manager temporarily enables KV-cacheing on a given model, which does not
    already have KV-caches setup. All forward passes using the model within this context manager
    will use KV-caches.

    KV-caches will be set-up with the given ``batch_size``, ``dtype``, and ``max_seq_len`` when
    entering the context manager, and will be deleted on exit.

    Example:
        >>> from torchtune.models.llama3_2 import llama3_2_1b
        >>> from torchtune.modules import local_kv_cache
        >>> import torch
        >>> model = llama3_2_1b()
        >>> print(model.caches_are_setup())
        False
        >>> print(model.caches_are_enabled())
        False
        >>> print(model.layers[0].attn.kv_cache)
        None
        >>> # entering cacheing mode
        >>> with local_kv_cache(model,
        >>>                     batch_size=1,
        >>>                     device=torch.device("cpu"),
        >>>                     dtype=torch.float32,
        >>>                     decoder_max_seq_len=1024):
        >>>     print(model.caches_are_setup())
        True
        >>>     print(model.caches_are_enabled())
        True
        >>>     print(model.layers[0].attn.kv_cache)
        KVCache()
        >>> # exited cacheing mode
        >>> print(model.caches_are_setup())
        False
        >>> print(model.caches_are_enabled())
        False
        >>> print(model.layers[0].attn.kv_cache)
        None

    Args:
        model (nn.Module): model to enable KV-cacheing for.
        batch_size (int): batch size for the caches.
        device (torch.device): device to setup caches on. this should be the same device
            the model is on.
        dtype (torch.dtype): dtype for the caches.
        encoder_max_seq_len (Optional[int]): maximum encoder cache sequence length.
        decoder_max_seq_len (Optional[int]): maximum decoder cache sequence length.

    Yields:
        None: Returns control to the caller with KV-caches setup and enabled on the given model.

    Raises:
        ValueError: If the model already has caches setup.
            You may use :func:`~torchtune.modules.common_utils.delete_kv_caches` to delete existing caches.
    zModel caches must be not setup prior to entering this context manager! Please use delete_kv_caches(model) to delete model caches.rf   N)r]   r^   Úsetup_cachesÚdelete_kv_caches)r   ri   rj   r   rg   rh   s         r   Úlocal_kv_cachern   =  sò   è è € ðB ×ÒÑÔð 
ÝðIñ
ô 
ð 	
ð
 
ð 
ð 
Ø×ÒØØØ 3Ø 3ð	 	ñ 	
ô 	
ð 	
ð
ð 
ð 
ñ 
ô 
ð 
ð 
ð 
ð 
ð 
ð 
øøøð 
ð 
ð 
ð 
ð Øˆˆˆå˜ÑÔÐÐÐøÕ˜ÑÔÐÐøøøs   ¨AÁAÁAÁA/ Á/B c                 óÞ   — |                       ¦   «         st          d¦  «        ‚|                      ¦   «         D ]4}t          |d¦  «        r"t	          |j        ¦  «        rd|_        d|_        Œ5dS )a„  
    Deletes KV caches from all attention layers in a model,
    and also ensures ``cache_enabled`` is set to False.

    Example:
        >>> from torchtune.models.llama3_2 import llama3_2_1b
        >>> from torchtune.modules import delete_kv_caches
        >>> import torch
        >>> model = llama3_2_1b()
        >>> model.setup_caches(batch_size=1,
        >>>                     dtype=torch.float32,
        >>>                     decoder_max_seq_len=1024)
        >>> print(model.caches_are_setup())
        True
        >>> print(model.caches_are_enabled())
        True
        >>> print(model.layers[0].attn.kv_cache)
        KVCache()
        >>> delete_kv_caches(model)
        >>> print(model.caches_are_setup())
        False
        >>> print(model.caches_are_enabled())
        False
        >>> print(model.layers[0].attn.kv_cache)
        None

    Args:
        model (nn.Module): model to enable KV-cacheing for.

    Raises:
        ValueError: if this function is called on a model which does not have
            caches setup. Use :func:`~torchtune.modules.TransformerDecoder.setup_caches` to
            setup caches first.
    zuYou have tried to delete model caches, but model.caches_are_setup() is False! Please setup caches on the model first.r\   FN)r]   r^   r`   ra   rb   r\   rc   rd   s     r   rm   rm   ‘  s‡   € ðF ×!Ò!Ñ#Ô#ð 
Ýð@ñ
ô 
ð 	
ð —-’-‘/”/ð #ð #ˆÝ6˜:Ñ&Ô&ð 	#­8°F´OÑ+DÔ+Dð 	#Ø#(ˆFÔ Ø"ˆFŒOøð#ð #r   )(Ú
contextlibr@   rU   Úcollectionsr   Ú	functoolsr   Útypingr   r   r   r   Úwarningsr	   rC   Útorch.nnÚnnÚtorch._subclasses.fake_tensorr
   r   Útorchao.dtypes.nf4tensorr   r   ÚboolÚ__annotations__Úbfloat16ÚModuleÚstrr   r   r'   Úlistr=   rQ   rZ   Úcontextmanagerre   rj   rn   rm   r&   r   r   ú<module>r€      s9  ðð Ð Ð Ð Ð Ø €€€Ø 
€
€
€
Ø #Ð #Ð #Ð #Ð #Ð #Ø Ð Ð Ð Ð Ð Ø 1Ð 1Ð 1Ð 1Ð 1Ð 1Ð 1Ð 1Ð 1Ð 1Ð 1Ð 1Ø Ð Ð Ð Ð Ð à €€€à Ð Ð Ð Ð Ð Ø MÐ MÐ MÐ MÐ MÐ MÐ MÐ MØ .Ð .Ð .Ð .Ð .Ð .àÐ $Ð Ð Ñ ð œØð#4ð #4ð #4ØŒ9ð#4àS˜#X”ð#4ð ð#4ð Œ;ð	#4ð
 ð#4ð ð#4ð #4ð #4ð #4ðLG #ð G¨sð G°t¸D´zð Gð Gð Gð Gð\ œØðC+ð C+ð C+ØŒ9ðC+àS˜#X”ðC+ð ðC+ð Œ;ð	C+ð
 ðC+ð ðC+ð C+ð C+ð C+ðP œØðð ØŒIðàŒ;ðð ðð ð ð ðD ÔðI,˜BœIð I,¨)Ð4DÔ*Eð I,ð I,ð I,ñ ÔðI,ðX Ôð *.Ø)-ðP ð P ð P ØŒ9ðP ð ðP ð ŒLð	P ð
 Œ;ðP ð " #œðP ð " #œðP ð ÐÔ ðP ð P ð P ñ ÔðP ðf+#˜BœIð +#ð +#ð +#ð +#ð +#ð +#r   