
    `is              	          d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlmZm	Z	 d dl m
Z
 d dlmZ d dlmZmZ d dlmZ d dlmZmZmZmZmZmZ d dlmZ d dlZd dlmZ d dlmZ d dl mc m!Z" d d	l#m$Z$ d d
l%m&Z& d dl'm(Z(m)Z)m*Z+ d dl,m-Z- d dl.m/Z/m0Z0 d dl1m2Z2 d dl3m4Z4m5Z5m6Z6 d dl7m8Z8 d dl9m:Z:m;Z;m<Z< d dl=m>Z>m?Z?m@Z@ d dlAmBZBmCZCmDZDmEZE d dlmFZFmGZG d dlHmIZJ d dlKmLZLmMZMmNZNmOZO d dlPmQZQmRZRmSZSmTZTmUZU d dlVmWZW dZXeSrdZYdZZej[        \                                ZXn-eTrdZYdZZn&eUrdZYdZZej]        \                                ZXnd ZYd!ZZd"ZX G d# d$e          Z^ G d% d&e          Z_ G d' d(ej`        e          Zad)ej`        d*ejb        d+efd,Zcd- Zd	 	 dxd)ej`        d0eefd1Zfdyd2Zgd3 Zhd4 Zidzd)ej`        d5eefd6Zjd)ej`        d7eefd8Zkd)ej`        d9eefd:Zl G d; d<          Zm G d= d>ea          Zn G d? d@ea          Zo G dA dBeo          Zp G dC dDeo          Zq G dE dFea          Zr G dG dHer          Zs G dI dJej`                  Zt G dK dLeo          Zu G dM dNej`                  Zv G dO dPejw                  Zx G dQ dRej`                  Zye jz        dSefdT            Z{e jz        dUefdV            Z|e jz        dWefdX            Z}ee jz        dYefdZ                        Z~ee jz        d[efd\                        Zee jz        d]efd^                        Zee jz        d_efd`                        Zdaed+edbedcefddZ	 d{dfej`        dgej`        dheedif         fdjZ ej        eUdk           G dl dmeM                      Z G dn doeL          Zd|dpee         fdqZ G dr dsej`                  Z G dt duej`                  Z G dv dwej`                  ZdS )}    N)ABCabstractmethod)nullcontext)deepcopy)autoEnumwraps)AnyCallablecastno_type_checkOptionalUnion)mock)
checkpoint)
DeviceMesh)
CPUOffloadfully_shardFullyShardedDataParallel)TrainingState)FSDPParamGroupRegisterPostBackwardFunction)#NO_RESHARD_AFTER_FORWARD_STRATEGIES)BackwardPrefetchMixedPrecisionShardingStrategy)ShardedGradScaler)always_wrap_policyModuleWrapPolicywrap)distribute_tensorDTensorShard)ColwiseParallelparallelize_moduleRowwiseParallelSequenceParallel)TransformerDecoderLayerTransformerEncoderLayer)DistributedDataParallel)MultiProcessTestCaseMultiThreadedTestCaserun_subtests
TEST_SKIPS)FILE_SCHEMAget_cycles_per_ms	TEST_CUDATEST_HPUTEST_XPU)
has_triton   cudancclzhpu:0hcclxpuxcclcpugloo   c                   6    e Zd Z e            Z e            ZdS )FSDPInitModeN)__name__
__module____qualname__r   NO_FSDP	RECURSIVE     w/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torch/testing/_internal/common_fsdp.pyr@   r@   W   s$        dffGIIIrG   r@   c                   J    e Zd Z e            Z e            Z e            ZdS )DEVICEInitModeN)rA   rB   rC   r   DEVICE_BEFOREDEVICE_AFTERDEVICE_NEVERrF   rG   rH   rJ   rJ   `   s/        DFFM466L466LLLrG   rJ   c                       e Zd ZdZedeej        df         fd            Zedej        fd            Z	edd            Z
eeded	edej        fd
                        ZdS )FSDPTestModelzZThis defines the interface expected from all models used commonly for
    FSDP unit tests.return.c                     dS )z+Returns an input for the model as as tuple.NrF   selfdevices     rH   	get_inputzFSDPTestModel.get_inputm   	     	rG   c                     dS )z,Returns the loss given the input and output.NrF   )rS   inputoutputs      rH   get_losszFSDPTestModel.get_lossr   rV   rG   Nc                     dS )z<Runs the backward pass (e.g. including ``loss.backward()``).NrF   rS   losss     rH   run_backwardzFSDPTestModel.run_backwardw   rV   rG   argskwargsc                      dS )z&Initializes an instance of this model.NrF   )r_   r`   s     rH   initzFSDPTestModel.init|   s	     	rG   rP   N)rA   rB   rC   __doc__r   tupletorchTensorrU   rZ   r^   staticmethodr   nnModulerb   rF   rG   rH   rO   rO   i   s          5s):#;    ^     ^    ^ C 3 29    ^ \  rG   rO   modelprocess_group	assert_fnc                    d |                                  D             }|d |                                 D             z  }t          j        |          }d t	          |          D             }t          j        |||           |d         }|J |dd         D ].}|J t          ||          D ]\  \  }}	\  }}
 ||	|
           /dS )a  
    All-gathers module states across ranks and calls ``assert_fn`` on each pair
    of corresponding states from rank 0 and a nonzero rank. For example, if
    ``assert_fn`` is ``self.assertEqual()``, then this checks that all module
    states are equal across ranks.
    c                 d    g | ]-\  }}||                                                                 f.S rF   detachr<   ).0
param_nameparams      rH   
<listcomp>z)_assert_module_states.<locals>.<listcomp>   sE       J 
U\\^^''))*  rG   c                 d    g | ]-\  }}||                                                                 f.S rF   rp   )rr   buffer_namebuffers      rH   ru   z)_assert_module_states.<locals>.<listcomp>   sE       K 
fmmoo))++,  rG   c                     g | ]}d S NrF   )rr   _s     rH   ru   z)_assert_module_states.<locals>.<listcomp>   s    ---aT---rG   groupr   Nr>   )named_parametersnamed_buffersdistget_world_sizerangeall_gather_objectzip)rk   rl   rm   named_module_states
world_sizeolistrank0_statesstater{   p1p2s              rH   _assert_module_statesr      s*    !&!7!7!9!9     #(#6#6#8#8    $]33J--5,,---E5"5]KKKK8L###qrr      #L% 8 8 	 	GQWaIb"	 rG   c                  4    t          j        t                    S rz   )rf   rT   DEVICE_TYPErF   rG   rH   get_devtyper      s    <$$$rG   FTzero_buffersc                    |rt          j        |           nt                      }|5  |                                 D ]A}t	          j                    5  |                                 ddd           n# 1 swxY w Y   B|rV|                                 D ]A}t	          j                    5  |                                 ddd           n# 1 swxY w Y   Bddd           dS # 1 swxY w Y   dS )zBZeros the parameters and optionally buffers of ``model`` in place.N)FSDPsummon_full_paramsr   
parametersrf   no_gradzero_buffers)rk   r   summon_fullctxrt   rx   s         rH   _zero_modelr      s    -8
J$
!%
(
(
([]]C	 # #%%'' 	 	E                 	#--// # #]__ # #LLNNN# # # # # # # # # # # # # # ## # # # # # # # # # # # # # # # # #sY   *C#A2&C#2A66C#9A6:/C#)C
>C#
CC#CC##C'*C'c                     |s|                      t                    } |r|                                  |                                 S rz   )tor   half
state_dict)rk   cpu_offloadr   s      rH   _get_state_dictr      sB     &%% 

rG   c                 F     d                      fd|D                       S )Nr{   c                 B    g | ]}|t          |                   ndS )Nnone)str)rr   stest_name_mappings     rH   ru   z subtest_name.<locals>.<listcomp>   s.    NNNAam	3q66	"	"NNNrG   )join)r   r_   s   ` rH   subtest_namer      s0    88NNNNNNN  rG   c                    |                                 D ]9\  }}|j        t          j        d          k    r|                                ||<   :| dk    r|nd g}t	          j        |           t          t          t          t          j	        f         |d                   }|
                                D ]%}||                             t                    ||<   &|S )Nr<   r   )itemsrT   rf   r<   r   broadcast_object_listr   dictr   rg   keysr   r   )rankr   rs   rt   r   s        rH   _broadcast_state_dictr      s     (--// 1 1
E<5<....%*YY[[Jz"199ZZ$/Eu%%%d3,-uQx88J oo'' H H
!+J!7!:!:;!G!G
:rG   recursec                     t          j        | |          5  t          t          |                                                     cddd           S # 1 swxY w Y   dS )a[  
    Returns the full unsharded parameters of ``model``. Any FSDP-managed
    parameters offloaded to CPU are moved to GPU in the returned list.

    Args:
        recurse (bool): If ``False``, only unshards the parameters immediate to
            ``model``; if ``True``, recurses through the module hierarchy
            rooted at ``model``.
    )r   N)r   r   r   listr   )rk   r   s     rH   get_full_paramsr      s     
	 	8	8	8 2 2U--//00112 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2s   .AAAmove_to_devicec                 >    |r|                      t                    n| S rz   )r   r   )rk   r   s     rH   _move_to_devicer      s    $2=588K   =rG   	wrap_fsdpc                 *    |s| nt          | g|R i |S rz   r   )rk   r   r_   r`   s       rH   _maybe_wrap_fsdpr      s)    !C55tE'CD'C'C'CF'C'CCrG   c                   <    e Zd ZdedefdZdefdZdefdZd ZdS )	DummyProcessGroupr   sizec                 "    || _         || _        d S rz   )_rank_size)rS   r   r   s      rH   __init__zDummyProcessGroup.__init__   s    



rG   rP   c                     | j         S rz   )r   rS   s    rH   r   zDummyProcessGroup.rank   
    zrG   c                     | j         S rz   )r   r   s    rH   r   zDummyProcessGroup.size   r   rG   c                 @    t          j                    }d }||_        |S )Nc                  l    t           j                                        } |                     d           | S )Nr>   )rf   futuresFuture
set_result)futures    rH   
get_futurez/DummyProcessGroup.allreduce.<locals>.get_future   s.    +0=+?+?+A+AFa   MrG   )r   Mockr   )rS   r_   r`   	dist_waitr   s        rH   	allreducezDummyProcessGroup.allreduce   s,    IKK		 	 	
  *	rG   N)rA   rB   rC   intr   r   r   r   rF   rG   rH   r   r      s{        S     c    c    	 	 	 	 	rG   r   c                        e Zd Zdej        dededef fdZd Zd Z	d Z
d	 Ze	 	 	 ddej        dededeeeef                  dededeej        ef         fd            Zd Z xZS )TransformerWithSharedParamsr}   device_init_modeadd_bndeterministicc                    t                                                       |                                | _        |                                | _        |rt          j        d           d}d}t          j        ||          | _	        t          j
        |dddd          | _        t          j        ||          | _        | j	        j        | j        _        |                     d| j	        j                            |f                     |                     d	t          j        | j        t
          j        
                     d| _        |r$t
          j                            | j                  nt
          j                                        | _        |t0          j        k    r|                     t6                    } |r|                                  d S d S )Nr               g?)d_modelnum_encoder_layersnum_decoder_layersdim_feedforwarddropout
vocab_biaslong_buffer)dtype)superr   r   r   r   rf   manual_seedri   	Embeddingembed_tokensTransformertransformerLinearoutput_projweightregister_buffernew_ones
zeros_liker   longbsBatchNorm1dIdentitybnrJ   rK   r   r   eval)rS   r}   r   r   r   d_vocabr   	__class__s          rH   r   z$TransformerWithSharedParams.__init__  s    	JJLL	**,, 	!a   L'::>  
 
 
 9Wg66 #'"3":$+2;;WJGG	
 	
 	
 	T_EJ???	
 	
 	

 39R%(&&tw///ux?P?P?R?R~;;;77;''D 	IIKKKKK	 	rG   c                    t          j        d| j        z              t          j        d|                              d| j                  }t          j        | j        dz  |                              d| j                  }||fS )Nr>      rT      r6   )rf   r   r   arangeviewr   )rS   rT   srctgts       rH   rU   z%TransformerWithSharedParams.get_input/  su    !di-(((l2f---221dg>>l47Q;v666;;AtwGGSzrG   c                     |                      |          }|| j        z   | j                            |          z   }|                      |          }|                     |          }|                     ||          }|                     |          S rz   )r   r   r   type_asr   r   r   )rS   src_idstgt_idsr   r   xs         rH   forwardz#TransformerWithSharedParams.forward5  s    ((DO#d&6&>&>s&C&CC((ggcllS#&&"""rG   c                     |\  }}t           j                            |                    d|                    d                    |                    d          d          S )Nsum)	reduction)ri   
functionalcross_entropyr   r   )rS   rX   rY   r{   r   s        rH   rZ   z$TransformerWithSharedParams.get_loss=  sS    3}**KKFKKOO,,chhrlle + 
 
 	
rG   c                 .    |                                  d S rz   backwardr\   s     rH   r^   z(TransformerWithSharedParams.run_backwardC      rG   NFTfsdp_init_modefsdp_kwargsrP   c                    |i }|t           j        k    r2t          | t                    r	| d         }n| }t	          ||||          S |t           j        k    rd|vrt          t          t          h          }n|	                    d          }d|v r8|d         t          j        t          j        hv rt          | t                    sd}n| }t          | t                    r	| d         }	n| }	t	          |	|||          }
t          |
|fd|i|}|t          j        k    r|                    t"                    }|S t%          d|           )au  
        Initializes a :class:`TransformerWithSharedParams` instance.

        Args:
            fsdp_init_mode (FSDPInitMode): If ``NO_FSDP``, then does not wrap
                any modules with FSDP. If ``RECURSIVE``, then wraps with
                top-level FSDP. By default, the top-level FSDP uses the
                ``ModuleWrapPolicy`` for encoder and decoder layers, but a
                different auto wrap policy may be specified via
                ``fsdp_kwargs``.
            device_init_mode (DEVICEInitMode): Determines model movement to DEVICE.
            fsdp_kwargs (Optional[Dict[str, Any]]): Optional keyword arguments
                forwarded to the FSDP constructor.
            deterministic (bool): Whether to make the model deterministic
                across constructions.
            add_bn (bool): Whether to include batch norm in the model.
        Nr   auto_wrap_policysharding_strategyUnsupported FSDP init mode: )r@   rD   
isinstancere   r   rE   r    r*   r)   popr   HYBRID_SHARD_HYBRID_SHARD_ZERO2r   rJ   rL   r   r   
ValueError)r}   r  r   r  r   r   pgr  fsdp_pg
tformer_pgm
fsdp_models               rH   rb   z TransformerWithSharedParams.initF  s   6 K\111%'' 1X.$fm   |555!44#3//$ $   $/??3E#F#F  ${22 34$13C3WXY Y"5%00Y %'' #"1X

"
+,fm A   "2 	 J  >#>>>']];77
HHHIIIrG   c                     | j         gS rz   )r   r   s    rH   get_ignored_modulesz/TransformerWithSharedParams.get_ignored_modules  s     !!rG   )NFT)rA   rB   rC   r   ProcessGrouprJ   boolr   rU   r  rZ   r^   rh   r@   r   r   r   r   r   ri   rj   r   rb   r   __classcell__r   s   @rH   r   r     sX       ( ( )( 	(
 ( ( ( ( ( (T  # # #
 
 
   
 15#KJ KJ KJ$KJ )KJ d38n-	KJ
 KJ KJ 
ry$	KJ KJ KJ \KJZ" " " " " " "rG   r   c                        e Zd Zdej        dededef fdZd Zd Z	d Z
d	 Ze	 	 ddej        dededeeeef                  dedej        fd            Z xZS )NestedWrappedModuler}   r   r   r   c                    t                                                                                       | _                                        | _        |t
          j        k    }fd}|rt          j        d           t          j
        t          t          j        dd          |           |t          j
         |t          t          j        dd          |                    t          t          j        dd          |                               |t          t          j        dd          |                    t          t          j        dd          |                    | _        d S )Nc                 (    rt          | fi S | S rz   r   layerr  r}   r   s    rH   _maybe_wrapz1NestedWrappedModule.__init__.<locals>._maybe_wrap  (     9E588K888LrG   r   r   r6   r   )r   r   r   r   r   rJ   rK   rf   r   ri   
Sequentialr   r   module	rS   r}   r   r   r   r  r   r+  r   s	    ``  `  rH   r   zNestedWrappedModule.__init__  sK    	JJLL	**,,)^-II	 	 	 	 	 	 	
  	!a   mBIaOO^<<KK	!R0@0@. Q QRR#BIb"$5$5~FF   K	"a(8(8.IIJJBIaOO^<<

 

rG   c                 j    t          j        d| j        z              t          j        dd|          fS )Nr>   r6   r   r   )rf   r   r   randrR   s     rH   rU   zNestedWrappedModule.get_input  s3    !di-(((
1a///11rG   c                 ,    |                      |          S rz   r.  rS   r  s     rH   r  zNestedWrappedModule.forward      {{1~~rG   c                 .    |                                 }|S rz   )r  rS   rX   rY   r]   s       rH   rZ   zNestedWrappedModule.get_loss  s    zz||rG   c                 .    |                                  d S rz   r  r\   s     rH   r^   z NestedWrappedModule.run_backward  r  rG   NFr  r  rP   c                    |i }|t           j        k    rt          | d||          S |t           j        k    r=t          | fd||d|}|t          j        k    r|                    t                    }|S t          d|           )a  
        Initializes a :class:`NestedWrappedModule` instance.

        Args:
            fsdp_init_mode (FSDPInitMode): If ``NO_FSDP``, then does not wrap
                any modules with FSDP. If ``RECURSIVE``, then wraps some nested
                modules with FSDP but not the top-level module. The model may
                later be wrapped with a top-level FSDP external to this method
                if desired.
            device_init_mode (DEVICEInitMode): Determines model movement to DEVICE.
            fsdp_kwargs (Optional[Dict[str, Any]]): Optional keyword arguments
                forwarded to the FSDP constructor.
            deterministic (bool): Whether to make the model deterministic
                across constructions.
        NFr   r   r   Tr  )	r@   rD   r&  rE   rJ   rL   r   r   r  )r}   r  r   r  r   r  s         rH   rb   zNestedWrappedModule.init  s    . K\111&!1+	    |555,!1+	 
  J  >#>>>']];77
HHHIIIrG   NF)rA   rB   rC   r   r!  r"  rJ   r   rU   r  rZ   r^   rh   r@   r   r   r   r   ri   rj   rb   r#  r$  s   @rH   r&  r&    s)       
 
 
 )	

 
 
 
 
 
 
@2 2 2       
 15#+J +J +J$+J )+J d38n-	+J
 +J 
+J +J +J \+J +J +J +J +JrG   r&  c                   p     e Zd Ze	 	 d	dej        dededee	e
ef                  def
 fd            Z xZS )
AlwaysWrapNestedWrappedModuleNFr}   r  r   r  r   c                 L   t          t          t                                        | t          j        |||          }|t          j        k    r|S |t          j        k    rD|pi }t          |fdt          i|}|t          j	        k    r|
                    t                    }|S dS )z
        Initializes a :class:`NestedWrappedModule` instance, but unlike
        :meth:`NestedWrappedModule.init`, for the ``RECURSIVE`` init mode, this
        wraps with top-level FSDP and the ``always_wrap_policy()`` auto wrap
        policy.
        )r}   r  r   r  r   r  N)r   r=  rb   r@   rD   rE   r   r   rJ   rL   r   r   )r}   r  r   r  r   rk   r  r   s          rH   rb   z"AlwaysWrapNestedWrappedModule.init  s     )+H
 

$'/-#'  
 
 	 \111L|555%+KeXX6HXKXXJ>#>>>']];77
 65rG   r;  )rA   rB   rC   rh   r   r!  r@   rJ   r   r   r   r   r"  rb   r#  r$  s   @rH   r=  r=    s        
 15#  $ ) d38n-	
      \    rG   r=  c                        e Zd Zdej        dededef fdZedd            Z	e	 	 ddej        d
e
dedeeeef                  def
d            Z xZS )NonUniformReqGradNWMr}   r   r   r   c                    t          t          |                                                                            | _                                        | _        |t          j        k    }fd}|rt          j	        d           t          j        t          t          j        dd          |           |t          j         |t          t          j        dd          |                    t          t          j        dd          |                               |t          j        t          t          j        dd          |          t          t          j        dd          |                                        | _        d S )Nc                 (    rt          | fi S | S rz   r   r)  s    rH   r+  z2NonUniformReqGradNWM.__init__.<locals>._maybe_wrap,  r,  rG   r   r   r6   r   )r   r&  r   r   r   r   rJ   rK   rf   r   ri   r-  r   r   r.  r/  s	    ``  `  rH   r   zNonUniformReqGradNWM.__init__  sh    	!4((11333 JJLL	**,,)^-II	 	 	 	 	 	 	
  	!a   mBIaOO^<<KK	!R0@0@. Q QRR#BIb"$5$5~FF   K#BIb!$4$4nEE#BIaOO^DD  
 
rG   rP   Nc                     |                                  D ]/\  }}t          j        ||          s|                    d           0d S r;  )r~   rematchrequires_grad_)rk   req_grad_masknps       rH   _set_nonuniform_req_gradz-NonUniformReqGradNWM._set_nonuniform_req_gradC  sT    **,, 	( 	(DAq8M1-- (  '''	( 	(rG   Fr  r  c                    t          j        d          }|t          j        k    r0t	          | d||          }t                              ||           |S |t          j        k    r\|i }t	          | fd||d|}|t          j        k    r|	                    t                    }t                              ||           |S t          d|           )a  
        Initializes a :class:`NestedWrappedModule` instance, but unlike
        :meth:`NestedWrappedModule.init`, it wraps a second :class:`torch.nn.Sequential`
        container to enable the desired non-uniform ``requires_grad``
        ``use_orig_params=True`` tests. For both ``RECURSIVE`` and ``NO_FSDP``
        init modes, freezes all parameters except the last two to validate
        ``ShardedGradScaler`` support for ranks with no (non-zero sized) local shards in
        FSDP ``use_orig_params=True`` mode.
        zmodule\.2.*\.1.*Fr:  NTr  )rD  compiler@   rD   r@  rJ  rE   rJ   rL   r   r   r  )r}   r  r   r  r   req_grad_pattern	ddp_modelr  s           rH   rb   zNonUniformReqGradNWM.initI  s    ( :&9::\111,!1+	  I !99)EUVVV|555" -!1+	 
  J  >#>>>']];77
 99*FVWWWHHHIIIrG   rc   r;  )rA   rB   rC   r   r!  r"  rJ   r   rh   rJ  r@   r   r   r   r   rb   r#  r$  s   @rH   r@  r@    s        (
 (
 (
 )	(

 (
 (
 (
 (
 (
 (
T ( ( ( \(
 
 15#+J +J +J$+J )+J d38n-	+J
 +J +J +J \+J +J +J +J +JrG   r@  c                        e Zd ZdZdej        dedef fdZd Zd Z	d Z
d	 Zed
ee         dedededef
d            Z xZS )ModuleWithDelayzThis class wraps a :class:`FSDPTestModel` to optionally add a delay
    after computing the loss and/or before the gradient reduction.r.  delay_after_loss_msdelay_before_reduction_msc                 r    t                                                       || _        || _        || _        d S rz   )r   r   rQ  rR  r.  )rS   r.  rQ  rR  r   s       rH   r   zModuleWithDelay.__init__|  s6     	#6 )B&rG   c                 6    | j                             |          S rz   )r.  rU   rR   s     rH   rU   zModuleWithDelay.get_input  s    {$$V,,,rG   c                 ,    |                      |          S rz   r3  r4  s     rH   r  zModuleWithDelay.forward  r5  rG   c                 6   | j                             ||          }| j        dk    rrt          st          rt          j        | j        dz             nGt          r@t          j	        
                    t          | j        t                      z                       |S Nr     )r.  rZ   rQ  r3   r4   timesleepr2   rf   r7   _sleepr   r1   r7  s       rH   rZ   zModuleWithDelay.get_loss  s    {##E622#a'' W8 W
43d:;;;; W
!!#d&>ARATAT&T"U"UVVVrG   c                      t           j        j         fd}t          j        d|          5   j                            |           d d d            d S # 1 swxY w Y   d S )Nc                     j         dk    rrt          rAt          j                            t          j         t                      z                       n*t          st          rt          j
        j         dz              | i |S rW  )rR  r2   rf   r7   r[  r   r1   r3   r4   rY  rZ  )r_   r`   orig_reduce_scatterrS   s     rH   _delayed_reduce_scatterz=ModuleWithDelay.run_backward.<locals>._delayed_reduce_scatter  s    -11 FJ%%D:=N=P=PPQQ     F FJt=DEEE&&7777rG   z'torch.distributed.reduce_scatter_tensor)rf   distributedreduce_scatter_tensorr   patchr.  r^   )rS   r]   r_  r^  s   `  @rH   r^   zModuleWithDelay.run_backward  s    #/E	8 	8 	8 	8 	8 	8 Z57N
 
 	+ 	+ K$$T***	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+s   AAAmodule_class
model_argsmodel_kwargsc                :    t           | j        |i |||          S )aA  
        Args:
            module_class (Type[FSDPTestModel]): Wrapped module class to which
                to add delays.
            model_args: Positional arguments forwarded to the ``module_class``
                ``init()``.
            delay_after_loss_ms (int): Delay after computing the loss/before
                the optimizer step (in ms).
            delay_before_reduction_ms (int): Delay before reduce-scattering
                gradients (in ms).
            model_kwargs: Keyword arguments forwarded to the ``module_class``
                ``init()``.
        )rP  rb   )rc  rQ  rR  rd  re  s        rH   rb   zModuleWithDelay.init  s1    * Lz:\::%
 
 	
rG   )rA   rB   rC   rd   ri   rj   r   r   rU   r  rZ   r^   rh   typerO   r   rb   r#  r$  s   @rH   rP  rP  x  s        F F			 !	 $'		 	 	 	 	 	- - -    + + +$ 
=)

 !
 $'	

 
 
 
 \
 
 
 
 
rG   rP  c                   ~    e Zd Zeej        ddddfdej        dedede	e
eef                  ded	ed
efd            ZdS )NestedWrappedModuleWithDelayNFr   r}   r  r   r  r   rQ  rR  c           
      P    t                               t          | ||||||          S )Nr}   r  r   r  r   rQ  rR  )rP  rb   r&  rk  s          rH   rb   z!NestedWrappedModuleWithDelay.init  s9     ##)-#' 3&? $ 	
 	
 		
rG   )rA   rB   rC   rh   rJ   rL   r   r!  r@   r   r   r   r   r"  r   rb   rF   rG   rH   ri  ri    s         ,:+F04##$)*
 
 
$
 )
 d38n-	

 
 !
 $'
 
 
 \
 
 
rG   ri  c                   $     e Zd Z fdZd Z xZS )DummyDDPc                 V    t                                                       || _        d S rz   )r   r   r.  )rS   r.  r   s     rH   r   zDummyDDP.__init__  s$    rG   c                      | j         |i |S rz   r3  rS   r_   r`   s      rH   r  zDummyDDP.forward  s    t{D+F+++rG   rA   rB   rC   r   r  r#  r$  s   @rH   rm  rm    sG            , , , , , , ,rG   rm  c                        e Zd Zdej        dedededef
 fdZd Z	d Z
e	 	 	 ddej        dededeeeef                  dedefd            Z xZS )MixtureOfExpertsr}   r   r   delay_before_free_msr   c                    t                                          ||||           || _        || _        || _        |t
          j        k    | _        |rt          j	        d| j
        z              d}d}d}	t          t          j        ||          | j                  }
t          d |
                                D                       | _        |
                                D ]	}d|_        
|rt          j	        d           t          t          j        ||          | j                  }|rNt          j                            |
                                g          }t)          |
|fi |}
t)          ||fi |}t          j        t          t          j        |	|          | j                  ||
t          t          j        ||	          | j                            | _        d S )	N)r}   r   r   r   *   r   r   r   c              3   >   K   | ]}|                                 V  d S rz   )numel)rr   rI  s     rH   	<genexpr>z,MixtureOfExperts.__init__.<locals>.<genexpr>  s*      $L$L1QWWYY$L$L$L$L$L$LrG   Tr   )r   r   r}   rt  r   rJ   rK   r   rf   r   r   r   ri   r   r  r   num_expert_paramsexpertr`  	new_groupr   r-  r.  )rS   r}   r   r   rt  r   r  d_expertd_sharedd_inputr{  rI  sharedexpert_groupr   s                 rH   r   zMixtureOfExperts.__init__  s    	-'	 	 	
 	
 	
 
$8!"..2NN 	.b49n--- 8X!>!>@STT!$$L$L8I8I8K8K$L$L$L!L!L""$$ 	 	AAHH 	!a    8X!>!>@STT 	8 ,66 L &,>>+>>F&%77;77FmBIgx88$:MNNBIh88$:MNN	
 
rG   c                 P     j         dk    r j        d         }t          |t                    rct          j        j        j        j         fd}t          j
        d|          5                       |          cd d d            S # 1 swxY w Y                        |          S )Nr   r   c                      t           rAt          j                            t	          j        t                      z                       n*t          st          rt          j
        j        dz              | i |S )NrX  )r2   rf   r7   r[  r   rt  r1   r3   r4   rY  rZ  )r_   r`   orig_reshardrS   s     rH   _delayed_reshardz2MixtureOfExperts.forward.<locals>._delayed_reshard   s      E
)) 9<M<O<O OPP    " EX E
4#<t#CDDD'<8888rG   z.torch.distributed.fsdp._runtime_utils._reshard)rt  r.  r  r   rf   r`  fsdp_runtime_utils_reshardr   rb  )rS   r  r{  r  r  s   `   @rH   r  zMixtureOfExperts.forward  s    $q(([^F&$'' *$05DM9 9 9 9 9 9 ZDFV  * *  ;;q>>* * * * * * * * * * * * * * * *
 {{1~~s   &BBBc                    |                                  | j        st          j                    5  |                                 D ]d}t          |d          r|j        J|j                            | j                   t          j	        
                    |j        | j                   e	 d d d            d S # 1 swxY w Y   d S d S )Nr{  r|   )r  r   rf   r   r   hasattrgraddiv_r   r`  
all_reducer}   )rS   r]   rI  s      rH   r^   zMixtureOfExperts.run_backward2  s   ~ 	O O O** O OAq(++ ! v)DO444)44QV4:4NNNOO O O O O O O O O O O O O O O O O O	O 	Os   A:B77B;>B;NFr   r  r  c                    |i }|t           j        k    rt          | d|||          S |t           j        k    r>t          | fd|||d|}|t          j        k    r|                    t                    }|S t          d|           )a  
        Initializes a :class:`MixtureOfExperts` instance.

        Args:
            fsdp_init_mode (FSDPInitMode): If ``NO_FSDP``, then does not wrap
                any modules with FSDP. If ``RECURSIVE``, then wraps some nested
                modules with FSDP, including the expert and shared layers, but
                not the top-level module. The model may later be wrapped with a
                top-level FSDP external to this method if desired.
            device_init_mode (DEVICEInitMode): Determines model movement to DEVICE.
            fsdp_kwargs (Optional[Dict[str, Any]]): Optional keyword arguments
                forwarded to the FSDP constructor.
            deterministic (bool): Whether to make the model deterministic
                across constructions.
            delay_before_free_ms (int): Delay before resharding expert
                parameters in the forward pass (in ms).
        NF)r   r   rt  r   Tr  )	r@   rD   rs  rE   rJ   rL   r   r   r  )r}   r  r   r  r   rt  r  s          rH   rb   zMixtureOfExperts.init>  s    4 K\111#!1%9+    |555)!1%9+   J  >#>>>']];77
HHHIIIrG   )NFr   )rA   rB   rC   r   r!  r"  rJ   r   r   r  r^   rh   r@   r   r   r   r   rb   r#  r$  s   @rH   rs  rs    s       2
 2
 2
 )	2

 "2
 2
 2
 2
 2
 2
 2
h  0
O 
O 
O 
 15#$%0J 0J 0J$0J )0J d38n-	0J
 0J "0J 0J 0J \0J 0J 0J 0J 0JrG   rs  c                        e Zd Z	 ddddddedeej                 ded	ed
ef
 fdZdej	        dej	        fdZ
d Z xZS )MLPNTFr6   )biaswith_bufferdim_multiplierdimrT   r  r  r  c                4   t                                                       t          j        |||z  ||          | _        t          j        ||z  |||          | _        |r-|                     dt          j        |f|                     d S d | _	        d S )N)rT   r  rx   r   )
r   r   ri   r   in_projout_projr   rf   randnrx   )rS   r  rT   r  r  r  r   s         rH   r   zMLP.__init__s  s     	yns&:6PTUUU	.3"6FQUVVV 	  5;vf+M+M+MNNNNNDKKKrG   r  rP   c                     |                      |          }t          j        |          }|                     |          }t          j        |          }| j        
|| j        z   }|S rz   )r  Frelur  rx   )rS   r  zs      rH   r  zMLP.forward  sT    LLOOF1IIMM!F1II;"DKArG   c                 j    | j         +t          j        j                            | j                    d S d S rz   )rx   rf   ri   rb   normal_r   s    rH   reset_parameterszMLP.reset_parameters  s2    ;"HM!!$+..... #"rG   rz   )rA   rB   rC   r   r   rf   rT   r"  r   rg   r  r  r#  r$  s   @rH   r  r  r  s         *.
 !   &
        " %,    / / / / / / /rG   r  c                   F     e Zd Zdddedef fdZdededed	d fd
Z xZS )MLPStackF)with_seq_parallelmlp_dimr  c                    t          |d          t          |          t          |d          g}|r)|                    t          j        |d                      t	                      j        |  || _        d S )N   )r  Fr  )r  appendri   	LayerNormr   r   r  )rS   r  r  modulesr   s       rH   r   zMLPStack.__init__  s     ***LL***	$
  	>NN2<e<<<==='""!2rG   tp_meshdp_meshuse_activation_checkpointingrP   c           
         t          d          t          d          t          d          t          d          t          d          | j        rt          t          d                    nt                      d}| j        rt	          d          |d<   t          | ||           | D ]=}t          |t          j                  r|rt          |           t          |fd	|i| >t          | fd	|i| | S )
NF)use_local_outputr>   )output_layouts)z	0.in_projz
0.out_projz	1.in_projz
1.out_projz	2.in_projz
2.out_proj)sequence_dim3)device_meshparallelize_planmesh)r%   r'   r  r$   r(   r&   r  ri   r  r   r   )rS   r  r  r  r  r  r.  s          rH   parallelizezMLPStack.parallelize  s5    )%@@@)5AAA(%@@@)5AAA(%@@@%#/qBBBB ""
 
 ! 	E$4!$D$D$DS!4WGWXXXX 	= 	=F&",// + #6"""<<W<<<<<D66w6+666rG   )	rA   rB   rC   r   r"  r   r   r  r#  r$  s   @rH   r  r    s        BG 
3 
3 
3 
34 
3 
3 
3 
3 
3 
3  '+	 
       rG   r  c                        e Zd ZdZd	dedef fdZdej        de	e
ej        ej        f         ej        f         fdZ xZS )
DoubleLinearz
    This can be used for returning multiple outputs from a module
    (``use_second_linear=True``) or for having an unused module (``False``).
    Tr  use_second_linearc                     t                                                       t          j        ||          | _        t          j        ||          | _        t          j                    | _        || _        d S rz   )	r   r   ri   r   lin1lin2ReLUr  r  )rS   r  r  r   s      rH   r   zDoubleLinear.__init__  sZ    Ic3''	Ic3''	GII	!2rG   r  rP   c                     | j         rP|                     |                     |                    |                     |                     |                    fS |                     |                     |                    S rz   )r  r  r  r  r4  s     rH   r  zDoubleLinear.forward  sc     ! 	D99TYYq\\**DIIdiill,C,CCCyy1&&&rG   T)rA   rB   rC   rd   r   r"  r   rf   rg   r   re   r  r#  r$  s   @rH   r  r    s         
3 3C 3D 3 3 3 3 3 3''	uU\5</0%,>	?' ' ' ' ' ' ' 'rG   r  new_all_gather_into_tensorc              #      K   t           j        }t          j                     | t           _        	 d V  t          j                     |t           _        d S # t          j                     |t           _        w xY wrz   )r   all_gather_into_tensorbarrier)r  orig_all_gathers     rH   patch_all_gatherr    sj      1OLNNN"<D6&5### 	&5#5555   A !A5new_reduce_scatter_tensorc              #      K   t           j        }t          j                     | t           _        	 d V  t          j                     |t           _        d S # t          j                     |t           _        w xY wrz   )r   ra  r  )r  r^  s     rH   patch_reduce_scatterr    sk      4LNNN!:D9%8""" 	%8"8888r  new_all_reducec              #      K   t           j        }t          j                     | t           _        	 d V  t          j                     |t           _        d S # t          j                     |t           _        w xY wrz   )r   r  r  )r  orig_all_reduces     rH   patch_all_reducer    sd      oOLNNN$DO*) 	)))))r  new_unshardc              #      K   t           j        }t          j                     | t           _        	 d V  t          j                     |t           _        d S # t          j                     |t           _        w xY wrz   )r   unshardr   r  )r  orig_unshards     rH   patch_unshardr    l       ")LLNNN(N.!- 	!-----r  new_reshardc              #      K   t           j        }t          j                     | t           _        	 d V  t          j                     |t           _        d S # t          j                     |t           _        w xY wrz   )r   reshardr   r  )r  r  s     rH   patch_reshardr    r  r  new_post_backwardc              #      K   t           j        }t          j                     | t           _        	 d V  t          j                     |t           _        d S # t          j                     |t           _        w xY wrz   )r   post_backwardr   r  )r  orig_post_backwards     rH   patch_post_backwardr    sm       (5LNNN#4N :'9$$$ 	'9$9999r  new_backwardc              #      K   t           j        }t          j                     | t           _        	 d V  t          j                     |t           _        d S # t          j                     |t           _        w xY wrz   )r   r  r   r  )r  orig_backwards     rH   *patch_register_post_backward_hook_backwardr  %  so       19MLNNN,8 )>0=$--- 	0=$-====r  r^  r_   r`   c                     t          |          dk    r	|d         }n"d|v r	|d         }nt          d| d|            ||            ||i |S )Nr   rY   z,Cannot get reduce-scatter output from
args: z	
kwargs: )lenAssertionError)clsr^  rm   r_   r`   rY   s         rH   reduce_scatter_with_assertr  2  s     4yy1}}a	V		!TDTTFTT
 
 	
 If////rG   rF   replicated_modulesharded_moduleprefixes_to_ignore.c                    t          |                                |                                          D ]\  \  }}\  }}|}|D ]}	|                    |	d          }|                     ||           |                     |t
                     t          |t
                    sJ |j        |j        }}
t          |          t          d          t          d          fk    rt          d          t          ||
|          }|                     |                                |                                           |j        |                     |j                   +|                     |j                   t          |j        |
|          }|                     |j        t
                     t          |j        t
                    sJ |                     |j                                        |                                           d S )N r   zmFSDP's (Shard(0), Shard(0)) layout differs from distribute_tensor(), so we cannot check for equality using it)r   r~   replaceassertEqualassertIsInstancer#   r  r  
placementsre   r$   r  r"   to_localr  assertIsNoneassertIsNotNone)r  r  r  r  replicated_namereplicated_paramsharded_namesharded_paramclean_sharded_nameprefixr  r  sharded_ref_paramsharded_ref_grads                 rH   check_sharded_parityr  E  s    OR**,,n.M.M.O.OO O T TJ+*-JlM *( 	H 	HF!3!;!;FB!G!G);<<<]G444-11111(4m6Njq588 444 ;   ..>jQQ..002C2L2L2N2NOOO (]/000M.///,-=-BD*UU]/999-,g66666*33557G7P7P7R7RSSSS1T TrG   znot-support-multithreadc                   F     e Zd Zed             Z fdZd Zd Zd Z xZ	S )FSDPTestMultiThreadc                     t           S rz   DEVICE_COUNTr   s    rH   r   zFSDPTestMultiThread.world_sizeh      rG   c                 p    t                                                       |                                  d S rz   )r   setUp_spawn_threadsrS   r   s    rH   r  zFSDPTestMultiThread.setUpl  s,    rG   c                 "    t          | g|R i |S rz   r.   rp  s      rH   r.   z FSDPTestMultiThread.run_subtestsp       D242226222rG   c                 B    t           j                                         d S rz   rf   _dynamoresetr   s    rH   perThreadSetUpz"FSDPTestMultiThread.perThreadSetUps      rG   c                 B    t           j                                         d S rz   r  r   s    rH   perThreadTearDownz%FSDPTestMultiThread.perThreadTearDownv  r  rG   )
rA   rB   rC   propertyr   r  r.   r  r  r#  r$  s   @rH   r  r  f  s~          X    3 3 3        rG   r  c            $           e Zd Z fdZed             Zed             Zedefd            Zed             Z	d Z
d Zd	 Zd
 Zed             Z	 	 	 	 	 	 	 d(dej        dedededee         dedee         dededeeeef                  fdZddd e            dddddddddfdee         dededee         d eded!ed"ee          d#ee!         dee         d$ed%ededed&eeeef                  deeeef                  f d'Z" xZ#S ))FSDPTestc                     t                                                       dt          j        d<   |                                  d S )N0TORCH_NCCL_DESYNC_DEBUG)r   r  osenviron_spawn_processesr  s    rH   r  zFSDPTest.setUp{  s;     14
,-rG   c                     t           S rz   r  r   s    rH   r   zFSDPTest.world_size  r  rG   c                 >    t           j                                        S rz   )r   distributed_c10d_get_default_groupr   s    rH   rl   zFSDPTest.process_group  s    $77999rG   rP   c                     dS r;  rF   r   s    rH   destroy_pg_upon_exitzFSDPTest.destroy_pg_upon_exit  s	     urG   c                 "    t            | j         S rz   )r0   	file_namer   s    rH   init_methodzFSDPTest.init_method  s    /t~///rG   c                 <    |                      ||j                   d S rz   )r  r   )rS   r  r   s      rH   _check_cpu_offloadzFSDPTest._check_cpu_offload  s!    j&<=====rG   c                 <    |                      ||j                   d S rz   )r  backward_prefetch)rS   r  r$  s      rH   _check_backward_prefetchz!FSDPTest._check_backward_prefetch  s"    *J,HIIIIIrG   c                 <    |                      ||j                   d S rz   )r  forward_prefetch)rS   r  r'  s      rH   _check_forward_prefetchz FSDPTest._check_forward_prefetch  s"    ):+FGGGGGrG   c                 "    t          | g|R i |S rz   r  rp  s      rH   r.   zFSDPTest.run_subtests  r  rG   c                     | |          }||_         ||_        |                    dd          }t          d|j          d|j                    t
          j                                        |j        k     r,t          j	        t          d|j                  j                   	 |rKt
          j        j        j        j                                        }t#          j        d|j        ||           n9t#          j        |j        t(          t+          |j                  |j                    nF# t,          $ r9}	d	|	j        d
         v r$t          j	        t          d         j                    d }	~	ww xY wd }
|j         t0          z  }t2          st4          rt
          j                            |           |g}
t#          j        |
           t
          j                                         |                    ||           t
          j                                         t#          j        |
           t#          j                      d S )Nfake_pgFzdist init r=z, world=z
multi-gpu-fake)backendr   r   store)r   r-  r   r   	recompiler   backend_unavailable)
device_ids)!r   r  getprintr   rf   acceleratordevice_countsysexitr/   	exit_codetesting	_internalr`  r+  	FakeStorer   init_process_groupr   DISTRIBUTED_BACKENDr   RuntimeErrorr_   r  r2   r4   set_device_indexr  r	  r
  run_testdestroy_process_group)r  r   	test_namer  piper`   rS   r+  r.  er1  	device_ids               rH   _runzFSDPTest._run  s'   s9~~	"**Y..ATYAAAABBB))++do==HZ >T_ > >?IJJJ	 /;CMMOO'"#	     ' $ 0/"4?33	     	 	 	afQi''$9:DEEE		 
I,	 	: 	:..y999[

 	
++++i&&&
++++"$$$$$s   #BD* *
E-44E((E-{Gz?NFrk   	num_stepsautocastlrfsdp_cpu_offload
save_modelmixed_precisionenable_sharded_grad_scaleruse_pure_fp16sharded_grad_scaler_kwargsc           	         |o|j         }t          |                                          j        }|
i }
t	          d	d|i|
}t
          j                            |                                |d          }t          |          D ]7}|	                                 t
          j
                            t          |          5  |j                            t          j        t                              }|	s|r]t          |t                     sHt          |t
          j                  r|                                }nt'          d |D                       } || }|rgt          |t                     rR|j        t*          vrD|                                D ]/}|                     |j        t          j        d                     0|j                            ||                              |          }d d d            n# 1 swxY w Y   |                    |          }|s"|	s |j        t
          j        k    s
J d            n|	r&|                     |j        t
          j                   n_t          |t                     r%|J |                     |j        |j                   n%|                     |j        t
          j                   |j                            |           |rYt          |t                     rD|                                D ]/}|                     |j        t          j        d                     0|                    |           |                                  |rTd |!                                "                                D             }tG          |           |$                    |           9t          |t                     r|%                    tL          j'                   |(                                S )
Nenabledg?)rJ  momentum)rR  c              3   >   K   | ]}|                                 V  d S rz   )r   )rr   r  s     rH   ry  z4FSDPTest._train_for_several_steps.<locals>.<genexpr>  s*      %>%>1affhh%>%>%>%>%>%>rG   r<   zeloss data type should be float32, as the original                     parameter data type is float32.c                 >    i | ]\  }}||                                 S rF   )clone)rr   kvs      rH   
<dictcomp>z5FSDPTest._train_for_several_steps.<locals>.<dictcomp>#  s&    RRRtq!aRRRrG   rF   ))offload_paramsnextr   rT   r   rf   optimSGDr   	zero_gradamprI  r   r.  rU   r  r   rg   r   re   r  r   r  rZ   r   scaler   float32float16param_dtyper^   stepupdater   r   r   load_state_dict_assert_stater   IDLErq   )rS   rk   rH  rI  rJ  rK  rL  rM  rN  rO  rP  cpu_offload_paramsmodel_devicesharded_grad_scalerr\  r{   rX   rY   rI  r]   r   s                        rH   _train_for_several_stepsz!FSDPTest._train_for_several_steps  sF    .Q2B2QE,,..//6%-)+&/ 
 
.
2L
 

  0 0 2 2rCHHy!! 9	2 9	2AOO##K#BB M M..u|K/H/HII  ?_ ?Zt=T=T ?!%66 ? %

 %%>%>%>%>%> > > '
H"5$//
H
 />? ? #--// H H((5<3F3FGGGG|,,UF;;>>|LL-M M M M M M M M M M M M M M M. ',,T22D" @= @zU]2225 3222
 ! @$$TZ????t,, @*666$$TZ1LMMMM$$TZ???L%%d+++! Dj&=&= D))++ D DA$$QXu|E/B/BCCCC$$U+++&&((( 2RRu7G7G7I7I7O7O7Q7QRRR
 E"""%%j111eT"" 	4 2333{{}}s   D/G==H	H	r   Tmodel_classr  r   ref_init_fn	num_itersr   r$  r  r'  use_orig_paramsinit_kwargsc                    |t           j        k    s
J d            |i }d}| j                                        } |j        | j        t           j        t
          j        fddi|}|9t          rt          |t          gt                    }nt          ||g|          }n ||          }|r|
                                }|                     |||
du|||
|||	  	        }t          |                                          }|                    |||	|
||d           	  |j        | j        |||fddi|}n5# t          $ r(}t!          d	| d
t#          |                     |d}~ww xY wt%          |t&                    st'          || j        fi |}|r|
                                }|t
          j        k    r|                    t                    }|duo|j        }|o|t
          j        k    }|o|t
          j        k    }|rFt/          j        d          }|                                D ]}|                     |j        |           |r#|                     t6          dt                     nt9                      }|5  |                     ||d||||
|||
  
        } ddd           n# 1 swxY w Y   |rdS |r`t/          j        d          }|                                D ]}|                     |j        |           |                     t                    } t;          |          }!t.          j                            || d           |
|s|                     ||!dd           dS dS dS )a  
        Tests FSDP training against a reference, which defaults to DDP but
        may be customized with ``ref_init_fn``.

        Args:
            model_class (Type[FSDPTestModel]): A model class that inherits from
                ``FSDPTestModel``, which defines the expected interface.
            fsdp_init_mode (FSDPInitMode): The mode to initialize the
                FSDP-wrapped model. This should not be ``NO_FSDP``.
            ref_init_fn (Optional[Callable]): A callable to invoke that wraps a
                non-wrapped model to construct the reference model, where this
                wrapper should provide data parallel semantics. If ``None``,
                then the callable defaults to the DDP constructor.
        z.Expects an FSDP init mode that wraps with FSDPNrG  r   T)r1  output_device)rI  rJ  rK  rM  rN  rO  rP  )r   r$  r  rM  r'  rp  zInitializing z raised error r<   zOAn FSDP-managed module with parameter CPU offloading enabled has parameters on F)rI  rJ  rK  rL  rM  rN  rO  rP  )check_dtypezFSDP did not match DDP)exact_devicemsg) r@   rD   rl   r   rb   rJ   rK   r3   DDPr   r   rl  r   r   re  	Exceptionr  r   r  r   rL   r   rZ  rf   rT   r  assertRaisesRegexr>  r   r   r9  assert_close)"rS   rm  r  r   rn  ro  rL  r   r$  r  rM  r'  rp  rN  rO  rq  rP  r  rJ  r   rk   	ref_modelref_loss
ddp_paramsr  rD  rZ  expects_device_errorexpects_cpu_device
cpu_devicert   context	fsdp_lossfsdp_unsharded_paramss"                                     rH   _test_fsdp_parityzFSDPTest._test_fsdp_parity-  s   D !5555< 655 K!&&((   (
 
 	

 
 
  N{m;  		  4&MMM		#E**I 	)!((I00$D0(+'A''A 1 

 

 )..0011
*%6%6#2$4#2 		
 		
 		

	Y))" 	 
 #  JJ  	Y 	Y 	YP[PPAPPQQWXX	Y*d++ 	M j$*<LLLLJ 	+#**J~:::#{33J$D0O[5O
 N/>3NN 	 N/>3NN 	  	;e,,J#..00 ; ;  z:::: $D""3%03 3    	  	 	55!,% /+E++E 6  I	 	 	 	 	 	 	 	 	 	 	 	 	 	 	   	F  	2e,,J#..00 ; ;  z::::![11I /
 ; ; 	""8YE"JJJ "="%!,	       #"""s*   D8 8
E*#E%%E* J--J14J1)rG  NFNFFN)$rA   rB   rC   r  r  r   rl   r"  r  r   r"  r%  r(  r.   classmethodrF  ri   rj   r   floatr   r   r   r   r   r   rl  rg  rO   r@   rJ   r   r   r   r  r#  r$  s   @rH   r  r  z  s                   X : : X: d    X 0 0 X0> > >J J JH H H3 3 3 3% 3% [3%t 15 48+0#?CU UyU U 	U
 U #:.U U ".1U %)U U %-T#s(^$<U U U Ux +/",*,,8<8<48!& %+0#04?C#g g-(g %g )	g
 h'g g g  g $$45g $$45g ".1g g g %)g g  d38n-!g" %-T#s(^$<#g g g g g g g grG   r  compile_compute_on_modulec                 H      fd G d dt                     fd}|S )Nc                      t          j        j        j        | i | t	          | d                   r| d                                          d S d S )Nr   )rf   r`  r  r   r  rL  )r_   r`   r  s     rH   !fully_shard_with_compiled_computez=compiled_fsdp_test.<locals>.fully_shard_with_compiled_compute  sb    *D;F;;;$,
G.1
 1
, GOO -,rG   c                   6    e Zd Z e            Z e            ZdS )*compiled_fsdp_test.<locals>.FullyShardModeN)rA   rB   rC   r   EAGERCOMPILED_COMPUTErF   rG   rH   FullyShardModer    s'        466rG   r  c                 B     t                      fd            }|S )Nc                     t           j        j        j        }D ]E}|j        k    r#t                      st          j        d           1t           j        j	        j
        }t           j        j	        j        }t           j                                         |j        k    r|}nL|j        k    r/dt           j        j	        _
        dt           j        j	        _        }nt          d|           |	j        |j        <    	| i | t           j                                         |	j        |j        <   |t           j        j	        _
        |t           j        j	        _        Gd S )Nz0Inductor on GPU needs Triton and recent GPU archTr>   z!Need to implement FullyShardMode=)rf   r`  r  r   r  r5   warningswarnr	  configskip_fsdp_hooks	_inductorcompile_threadsr  r  NotImplementedError__globals__rA   )
r_   r`   original_fully_shardmodeoriginal_skip_fsdp_hooksoriginal_compile_threadsfully_shard_patchr  r  funcs
          rH   wrapperz6compiled_fsdp_test.<locals>.decorator.<locals>.wrapper  s]   (-(9(>(J & R R>///
/M"TUUU+0=+?+O(+0?+A+Q(!))+++>///(<%%^<<<;?EM(8=>EO*:(I%%-BDBB   CT !5!>?d%f%%%!))+++BV !5!>?7O$49Q&669R RrG   r	   )r  r  r  r  s   ` rH   	decoratorz%compiled_fsdp_test.<locals>.decorator  sF    	t	R 	R 	R 	R 	R 	R 
	R@ rG   )r   )r  r  r  r  s   ` @@rH   compiled_fsdp_testr    so        " " " " " " " "" " " " " "H rG   c                   &     e Zd Zd fdZd Z xZS )
SkipModulerP   Nc                     t                                                       t          j        ddd          | _        d S N
   Fr  )r   r   ri   r   linr  s    rH   r   zSkipModule.__init__  s5    9R%000rG   c                 ,    |                      |          S rz   )r  r4  s     rH   r  zSkipModule.forward  s    xx{{rG   rc   rq  r$  s   @rH   r  r  
  sL        1 1 1 1 1 1      rG   r  c                   $     e Zd Z fdZd Z xZS )NestedLinearc                 :   t                                                       |rCt          t          j        ddd                              t                              | _        d S t          j        ddd                              t                    | _        d S r  )r   r   r!   ri   r   r   r   nested_linear)rS   	fsdp_wrapr   s     rH   r   zNestedLinear.__init__  s     	O!%biBU&C&C&C&F&F{&S&S!T!TD!#2r!>!>!>!A!A+!N!NDrG   c                 ,    |                      |          S rz   )r  r4  s     rH   r  zNestedLinear.forward  s    !!!$$$rG   rq  r$  s   @rH   r  r    sL        O O O O O% % % % % % %rG   r  c                   $     e Zd Z fdZd Z xZS )	SkipModelc                 X   t                                                       t          j        ddd                              t
                    | _        t                                          t
                    | _        t          t          |          t
                    | _        d S )Nr  Fr  )r  )rE  )r   r   ri   r   r   r   linearr  linear_skipr!   r  r  )rS   double_nestr   s     rH   r   zSkipModel.__init__   s    iBU33366{CC%<<??;77!;///;
 
 
rG   c                     |                      |          }|                     |          }|                     |          }|S rz   )r  r  r  r4  s     rH   r  zSkipModel.forward(  s<    KKNNQq!!rG   rq  r$  s   @rH   r  r    sG        
 
 
 
 
      rG   r  )FT)FFr  )rF   rz   )
contextlibr  rD  r6  rY  unittestr  abcr   r   r   copyr   enumr   r   	functoolsr
   typingr   r   r   r   r   r   r   rf   torch.distributedr`  r   torch.nnri   torch.nn.functionalr	  r  torch.distributed._composabler   torch.distributed.device_meshr   torch.distributed.fsdpr   r   r   r   $torch.distributed.fsdp._common_utilsr   5torch.distributed.fsdp._fully_shard._fsdp_param_groupr   r   "torch.distributed.fsdp._init_utilsr   2torch.distributed.fsdp.fully_sharded_data_parallelr   r   r   *torch.distributed.fsdp.sharded_grad_scalerr   torch.distributed.fsdp.wrapr   r    r!   torch.distributed.tensorr"   r#   r$   !torch.distributed.tensor.parallelr%   r&   r'   r(   r)   r*   torch.nn.parallel.distributedr+   rw  *torch.testing._internal.common_distributedr,   r-   r.   r/   $torch.testing._internal.common_utilsr0   r1   r2   r3   r4   torch.utils._tritonr5   r  r   r=  r7   r5  r:   r@   rJ   rj   rO   r!  r   r   r"  r   r   r   r   r   r   r   r   r   r&  r=  r@  rP  ri  rm  rs  r  r-  r  r  contextmanagerr  r  r  r  r  r  r  r  re   r   r  skipIfr  r  rg  r  r  r  r  rF   rG   rH   <module>r     s
       				 				 



    # # # # # # # # " " " " " "                     F F F F F F F F F F F F F F F F                                   4 4 4 4 4 4 4 4 4 4 4 4         
 ? > > > > >        S R R R R R         
 I H H H H H R R R R R R R R R R F F F F F F F F F F            F E E E E E E E H H H H H H                         + * * * * *  K :**,,LL 
K  K 9))++LLK L    4       T       BIs   49$    >% % % # #9## # # #"       "2 229 2t 2 2 2 2>29 >d > > > >DBI D$ D D D D       .Q" Q" Q" Q" Q"- Q" Q" Q"h[J [J [J [J [J- [J [J [J|    $7   D]J ]J ]J ]J ]J. ]J ]J ]J@J
 J
 J
 J
 J
m J
 J
 J
Z
 
 
 
 
? 
 
 
., , , , ,ry , , ,JJ JJ JJ JJ JJ* JJ JJ JJZ/ / / / /") / / /@* * * * *r} * * *Z' ' ' ' '29 ' ' '6 6 6 6 6 6 9H 9 9 9 9 *X * * * * .x . . .  . .x . . .  . :8 : : :  : >X > > >  >0!0 0 	0
 0 0 0 0. +-	T TyT IT c3h	T T T TB 455    /   65&Z Z Z Z Z# Z Z Zz
0 0(4. 0 0 0 0f       	% 	% 	% 	% 	%29 	% 	% 	%    	     rG   