import logging
from collections import abc, defaultdict
from collections.abc import Iterable
from typing import Any, Optional, overload, Union

import torch
import torch.distributed as dist
from torch.amp.grad_scaler import _MultiDeviceReplicator, GradScaler, OptState
from torch.distributed.distributed_c10d import ProcessGroup


logger = logging.getLogger(__name__)


def _refresh_per_optimizer_state() -> dict[str, Any]:
    return {"stage": OptState.READY, "found_inf_per_device": {}}


def _is_supported_device(tensor: torch.Tensor) -> bool:
    return tensor.is_cuda or tensor.device.type in (
        "xla",
        "cpu",
        "hpu",
        "mtia",
        "xpu",
        torch._C._get_privateuse1_backend_name(),
    )


class _GeneralMultiDeviceReplicator(_MultiDeviceReplicator):
    """
    Lazily serves a tensor to the requested device. This class extends
    _MultiDeviceReplicator to allow support for "cpu" as a device.
    """

    def __init__(self, master_tensor: torch.Tensor) -> None:
        assert _is_supported_device(master_tensor)
        self.master = master_tensor
        self._per_device_tensors: dict[torch.device, torch.Tensor] = {}


class ShardedGradScaler(GradScaler):
    """
    ShardedGradScaler helps perform gradient scaling in a shard-aware manner. It extends
    functionality from GradScaler:
    * Supports PyTorch DDP and FSDP implementations
    * Supports CPU offloaded tensors (as used in fully sharded data parallel [FSDP])
    * Supports the custom Mixed Precision loss dtype (fp16, bf16) that FSDP returns
    * Syncs inf/nan for scaled gradient tensors on any torch.device (where tensors are
      placed) across nodes

    Example::

        # Creates a ShardedGradScaler once at the beginning of training.
        scaler = ShardedGradScaler()

        for epoch in epochs:
            for input, target in data:
                optimizer.zero_grad()
                output = model(input)
                loss = loss_fn(output, target)

                # Scales loss.  Calls backward() on scaled loss to create scaled gradients.
                scaler.scale(loss).backward()

                # scaler.step() first unscales gradients of the optimizer's params.
                # If gradients don't contain infs/NaNs, optimizer.step() is then called,
                # otherwise, optimizer.step() is skipped.
                scaler.step(optimizer)

                # Updates the scale for next iteration.
                scaler.update()

    See :class:`GradScaler` for explanation of scaling/unscaling and more use cases.

    Args:
        init_scale (float, optional, default=2.**16):  Initial scale factor.
        growth_factor (float, optional, default=2.0):  Factor by which the scale is multiplied during
            :meth:`update` if no inf/NaN gradients occur for ``growth_interval`` consecutive iterations.
        backoff_factor (float, optional, default=0.5):  Factor by which the scale is multiplied during
            :meth:`update` if inf/NaN gradients occur in an iteration.
        growth_interval (int, optional, default=2000):  Number of consecutive iterations without inf/NaN gradients
            that must occur for the scale to be multiplied by ``growth_factor``.
        enabled (bool, optional):  If ``False``, disables gradient scaling. :meth:`step` simply
            invokes the underlying ``optimizer.step()``, and other methods become no-ops.
            Default: ``True``
        process_group (ProcessGroup, optional, default=torch.distributed.group.WORLD):
            process group for sharding
    """

    def __init__(
        self,
        device: str = "cuda",
        init_scale: float = 2.0**16,
        backoff_factor: float = 0.5,
        growth_factor: float = 2.0,
        growth_interval: int = 2000,
        enabled: bool = True,
        process_group: Optional[ProcessGroup] = dist.group.WORLD,
    ) -> None:
        super().__init__(
            device,
            init_scale=init_scale,
            backoff_factor=backoff_factor,
            growth_factor=growth_factor,
            growth_interval=growth_interval,
            enabled=enabled,
        )
        if self._enabled:
            self.process_group = process_group
            self._per_optimizer_states = defaultdict(_refresh_per_optimizer_state)

    @overload
    def scale(self, outputs: torch.Tensor) -> torch.Tensor: ...

    @overload
    def scale(self, outputs: list[torch.Tensor]) -> list[torch.Tensor]: ...

    @overload
    def scale(self, outputs: tuple[torch.Tensor, ...]) -> tuple[torch.Tensor, ...]: ...

    @overload
    def scale(self, outputs: Iterable[torch.Tensor]) -> Iterable[torch.Tensor]: ...

    def scale(
        self, outputs: Union[torch.Tensor, Iterable[torch.Tensor]]
    ) -> Union[torch.Tensor, Iterable[torch.Tensor]]:
        if not self._enabled:
            return outputs

        if isinstance(outputs, torch.Tensor):
            assert _is_supported_device(outputs)
            if self._scale is None:
                self._lazy_init_scale_growth_tracker(outputs.device)
            assert self._scale is not None
            scaled_output = outputs * self._scale.to(
                device=outputs.device, non_blocking=True
            )
            # Here we ensure the return dtype is the same as the outputs dtype.
            # For the FSDP + Mixed Precision use case, the loss output is in the Mixed
            # Precision format (fp16, bf16) and so the scaled loss should be of the
            # same dtype.
            return scaled_output.type(outputs.dtype)

        stash: list[_GeneralMultiDeviceReplicator] = []

        def apply_scale(
            val: Union[torch.Tensor, Iterable[torch.Tensor]],
        ) -> Union[torch.Tensor, Iterable[torch.Tensor]]:
            if isinstance(val, torch.Tensor):
                assert _is_supported_device(val)
                if len(stash) == 0:
                    if self._scale is None:
                        self._lazy_init_scale_growth_tracker(val.device)
                    assert self._scale is not None
                    stash.append(_GeneralMultiDeviceReplicator(self._scale))
                scaled_val = val * stash[0].get(val.device)
                # Keep the scaled value in the same dtype as the unscaled value
                # (see the dtype comment above).
                return scaled_val.type(val.dtype)
            if isinstance(val, abc.Iterable):
                iterator = map(apply_scale, val)
                if isinstance(val, (list, tuple)):
                    return type(val)(iterator)
                return iterator
            raise ValueError("outputs must be a Tensor or an iterable of Tensors")

        return apply_scale(outputs)

    def _unscale_grads_(
        self,
        optimizer: torch.optim.Optimizer,
        inv_scale: torch.Tensor,
        found_inf: torch.Tensor,
        allow_fp16: bool = True,
    ) -> dict[torch.device, torch.Tensor]:
        per_device_inv_scale = _GeneralMultiDeviceReplicator(inv_scale)
        per_device_found_inf = _GeneralMultiDeviceReplicator(found_inf)

        # To set up _amp_foreach_non_finite_check_and_unscale_, split grads by device
        # and dtype. There could be hundreds of grads, so we'd like to iterate through
        # them just once. However, we don't know their devices or dtypes in advance.
        per_device_and_dtype_grads = defaultdict(lambda: defaultdict(list))
        with torch.no_grad():
            for group in optimizer.param_groups:
                for param in group["params"]:
                    if param.grad is None:
                        continue
                    if (not allow_fp16) and param.grad.dtype == torch.float16:
                        raise ValueError("Attempting to unscale FP16 gradients.")
                    if param.grad.is_sparse:
                        # is_coalesced() == False means the sparse grad has values with
                        # duplicate indices. coalesce() deduplicates indices and adds
                        # all values that have the same index. For scaled fp16 values,
                        # there's a good chance coalescing will cause overflow, so we
                        # should check the coalesced _values().
                        if param.grad.dtype is torch.float16:
                            # coalesce is not supported in torch.float16
                            param_grad_fp32 = param.grad.type(torch.float32).coalesce()
                            param.grad = param_grad_fp32.type(torch.float16)
                        to_unscale = param.grad._values()
                    else:
                        to_unscale = param.grad

                    per_device_and_dtype_grads[to_unscale.device][
                        to_unscale.dtype
                    ].append(to_unscale)

            for device, per_dtype_grads in per_device_and_dtype_grads.items():
                for grads in per_dtype_grads.values():
                    torch._amp_foreach_non_finite_check_and_unscale_(
                        grads,
                        per_device_found_inf.get(device),
                        per_device_inv_scale.get(device),
                    )
        # There exist contexts (e.g. w/ `use_orig_params=True`) wherein some ranks may
        # hold no (non-zero sized) parameter shards, necessitating the initialization
        # of `per_device_found_inf._per_device_tensors` when no found_inf tensors
        # exist on a given rank.
        if not per_device_found_inf._per_device_tensors:
            assert self._scale is not None
            per_device_found_inf.get(self._scale.device)
        return per_device_found_inf._per_device_tensors

    def unscale_(self, optimizer: torch.optim.Optimizer) -> None:
        if not self._enabled:
            return

        self._check_scale_growth_tracker("unscale_")

        optimizer_state = self._per_optimizer_states[id(optimizer)]

        if optimizer_state["stage"] is OptState.UNSCALED:
            raise RuntimeError(
                "unscale_() has already been called on this optimizer since the last update()."
            )
        elif optimizer_state["stage"] is OptState.STEPPED:
            raise RuntimeError("unscale_() is being called after step().")

        # FP32 division can be imprecise for certain compile options, so we carry out
        # the reciprocal in FP64.
        assert self._scale is not None
        inv_scale = self._scale.double().reciprocal().float()
        found_inf = torch.full(
            (1,), 0.0, dtype=torch.float32, device=self._scale.device
        )

        optimizer_state["found_inf_per_device"] = self._unscale_grads_(
            optimizer, inv_scale, found_inf, True
        )
        optimizer_state["stage"] = OptState.UNSCALED

        # Synchronize the detected inf across the ranks.
        optimizer_state = self._per_optimizer_states[id(optimizer)]
        works = []
        found_inf_on_cpus = []
        found_inf_on_devices = []

        for found_inf in optimizer_state["found_inf_per_device"].values():
            if self._device != "cpu" and found_inf.device.type == "cpu":
                found_inf_on_cpus.append(found_inf)
                found_inf_on_device = found_inf.to(self._device)
                found_inf_on_devices.append(found_inf_on_device)
                works.append(
                    dist.all_reduce(
                        found_inf_on_device, async_op=True, group=self.process_group
                    )
                )
            else:
                works.append(
                    dist.all_reduce(found_inf, async_op=True, group=self.process_group)
                )
        for work in works:
            work.wait()
        if found_inf_on_cpus:
            torch._foreach_copy_(found_inf_on_cpus, found_inf_on_devices)
    def _amp_update_scale_cpu_(self, found_inf: torch.Tensor) -> None:
        """
        If found_inf is 1.0 (True), then scale is multiplied by backoff_factor and growth_tracker is set to zero.
        Otherwise, scale is multiplied by the growth factor when the growth interval is reached.
        """
        assert self._scale is not None and self._growth_tracker is not None

        if found_inf.item() >= 1.0:
            self._scale *= self._backoff_factor
            self._growth_tracker.fill_(0)
        else:
            successful = self._growth_tracker + 1
            if successful == self._growth_interval:
                self._scale *= self._growth_factor
                self._growth_tracker.fill_(0)
            else:
                self._growth_tracker = successful

    def update(self, new_scale: Optional[Union[float, torch.Tensor]] = None) -> None:
        """
        Updates the scale factor.
        If any optimizer steps were skipped the scale is multiplied by ``backoff_factor``
        to reduce it. If ``growth_interval`` unskipped iterations occurred consecutively,
        the scale is multiplied by ``growth_factor`` to increase it.
        Passing ``new_scale`` sets the new scale value manually. (``new_scale`` is not
        used directly, it's used to fill GradScaler's internal scale tensor. So if
        ``new_scale`` was a tensor, later in-place changes to that tensor will not further
        affect the scale GradScaler uses internally.)
        Args:
            new_scale (float or :class:`torch.Tensor`, optional, default=None):  New scale factor.
        .. warning::
            :meth:`update` should only be called at the end of the iteration, after ``scaler.step(optimizer)`` has
            been invoked for all optimizers used this iteration.
        """
        if not self._enabled:
            return

        _scale, _growth_tracker = self._check_scale_growth_tracker("update")

        if new_scale is not None:
            # Accept a new user-defined scale.
            if isinstance(new_scale, float):
                self._scale.fill_(new_scale)
            else:
                reason = (
                    "new_scale should be a float or a 1-element torch.cuda.FloatTensor or "
                    "torch.FloatTensor with requires_grad=False."
                )
                assert new_scale.device.type == self._device, reason
                assert new_scale.numel() == 1, reason
                assert new_scale.requires_grad is False, reason
                self._scale.copy_(new_scale)
        else:
            # Consume shared inf/nan data collected from optimizers to update the
            # scale. If all found_inf tensors are on the same device as self._scale,
            # this operation is asynchronous.
            found_infs = [
                found_inf.to(device=_scale.device, non_blocking=True)
                for state in self._per_optimizer_states.values()
                for found_inf in state["found_inf_per_device"].values()
            ]

            assert len(found_infs) > 0, "No inf checks were recorded prior to update."

            found_inf_combined = found_infs[0]
            if len(found_infs) > 1:
                for i in range(1, len(found_infs)):
                    found_inf_combined += found_infs[i]

            if _scale.device.type == "cpu":
                self._amp_update_scale_cpu_(found_inf_combined)
            else:
                torch._amp_update_scale_(
                    self._scale,
                    self._growth_tracker,
                    found_inf_combined,
                    self._growth_factor,
                    self._backoff_factor,
                    self._growth_interval,
                )

        # To prepare for the next iteration, clear the data collected from optimizers
        # this iteration.
        self._per_optimizer_states = defaultdict(_refresh_per_optimizer_state)
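

# A minimal end-to-end sketch (not part of the module): how ShardedGradScaler is
# typically paired with FSDP and autocast. ``MyModel``, ``loader``, and ``loss_fn``
# are hypothetical stand-ins.
#
#     from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
#
#     model = FSDP(MyModel().cuda())
#     optimizer = torch.optim.AdamW(model.parameters())
#     scaler = ShardedGradScaler()
#
#     for inputs, targets in loader:
#         optimizer.zero_grad()
#         with torch.autocast("cuda", dtype=torch.float16):
#             loss = loss_fn(model(inputs), targets)
#         scaler.scale(loss).backward()
#         scaler.step(optimizer)  # skipped internally if any rank saw inf/NaN grads
#         scaler.update()  # backoff on inf/NaN; growth after growth_interval clean steps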