
    Pi(                         d dl Z d dlmZ d dlmZ d dlmZ d dlmZm	Z	m
Z
 d dlmZmZ d dlmZ d dlmZmZ d dlmZ d	 Z G d
 de          Z G d de
          Z G d de	          ZdS )    N)DTensor)
DeviceMesh)ColwiseParallelPrepareModuleInputRowwiseParallel)ScalingType
e4m3_dtype)tensor_already_casted_to_fp8)NoopFwToFloat8BwDynamichp_tensor_to_float8_dynamic)GemmInputRolec                 V    | j         t          j        k    o| j        t          j        k    S )N)scaling_type_inputr   DYNAMICscaling_type_grad_output)ms    y/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torchao/float8/float8_tensor_parallel.py(_float8_linear_supports_float8_allgatherr   "   s*     	
 33 	>&+*==    c                   p     e Zd ZdZed             Zed             Zdej        de	dej        f fdZ
 xZS )Float8ColwiseParallelzo
    Like `ColwiseParallel`, but with all-gather in float8. This
    currently assumes tensorwise scaling.
    c                 *   |d         }t          |t                    st          j        ||| d          }t          |          s1t	          ||j        j        j        |j        t          j
                  }| |k    r|                    |d          }|S Nr   F	run_checkgemm_input_roleT
placementsasync_op
isinstancer   
from_localr
   r   configcast_config_inputtarget_dtypelinear_mm_configr   INPUTredistributeinput_layoutsdesired_input_layoutsmodinputsdevice_meshinput_tensors         r   _prepare_input_fnz'Float8ColwiseParallel._prepare_input_fn0   s    
 ay,00 	"-k=E  L ,L99 	6
,9$ - 3	  L 111'4404 5  L r   c                     |j         | k    r|                    | d          }t          j        ||j        |j        j        j                  }|r|                                n|S NTr   	r   r)   r   applyr'   r$   cast_config_grad_outputr&   to_localoutput_layoutsuse_local_outputr-   outputsr/   s        r   _prepare_output_fnz(Float8ColwiseParallel._prepare_output_fnJ   sw     //**)D +  G
 */ J.;
 
 &6Bw!!!7Br   moduler/   returnc                    ddl m} t          ||          st          dt	          |                     t          ||          rt          |          st          d          t                                          ||          S Nr   Float8Linearz.Expecting module to be Float8Linear but found unsupported	torchao.float8.float8_linearrB   r"   
ValueErrortyper   AssertionErrorsuper_applyselfr=   r/   rB   	__class__s       r   rJ   zFloat8ColwiseParallel._apply\       ======&,// 	0OfOO   L
 
 	0:6BB	0 !///ww~~fk222r   __name__
__module____qualname____doc__staticmethodr1   r<   nnModuler   rJ   __classcell__rM   s   @r   r   r   *   s         
   \2 C C \C"3RY 3Z 3BI 3 3 3 3 3 3 3 3 3 3r   r   c                   p     e Zd ZdZed             Zed             Zdej        de	dej        f fdZ
 xZS )Float8RowwiseParallelzo
    Like `RowwiseParallel`, but with all-gather in float8. This
    currently assumes tensorwise scaling.
    c                 *   |d         }t          |t                    st          j        ||| d          }t          |          s1t	          ||j        j        j        |j        t          j
                  }| |k    r|                    |d          }|S r   r!   r*   s         r   r1   z'Float8RowwiseParallel._prepare_input_fnq   s     ay,00 	"-k=E  L ,L99 	6
,9$ - 3	  L 111'4404 5  L r   c                     |j         | k    r|                    | d          }t          j        ||j        |j        j        j                  }|r|                                n|S r3   r4   r8   s        r   r<   z(Float8RowwiseParallel._prepare_output_fn   sp    
 //**nt*TTG */ J.;
 
 &6Bw!!!7Br   r=   r/   r>   c                    ddl m} t          ||          st          dt	          |                     t          ||          rt          |          st          d          t                                          ||          S r@   rD   rK   s       r   rJ   zFloat8RowwiseParallel._apply   rN   r   rO   rX   s   @r   rZ   rZ   k   s         
   \. C C \C"3RY 3Z 3BI 3 3 3 3 3 3 3 3 3 3r   rZ   c                   p     e Zd ZdZdddddej        dd fd
Zd Zdej	        de
d	ej	        f fd
Z xZS )PrepareFloat8ModuleInputa  
    Like `PrepareModuleInput`, but with all-gather in float8. This
    currently assumes tensorwise scaling.

    The only difference from `PrepareModuleInput` is that
    after we prepare the input DTensor, we cast the input to DTensor(Float8TrainingTensor)
    This is to ensure the float8 cast happens before the all-gather (i.e. Shard -> Replicate)
    so that if there are multiple float8 users of the input activation, we perform fp8 allgather
    only once.
    FP8 Args:
      float8_dtype (torch.dtype, optional): control what float8 dtype to cast to when prepare the module input,
          we currently only support torch.float8_e4m3fn. default: torch.float8_e4m3fn
      fwd_config_submodule_fqn (str, optional): the fqn of the submodule that contains the forward config used
          for the float8 cast. If not specified, we will search for the Float8Linear in the submodules
          and use the forward config from that module, in this case all module's forward config must be
          the same.
    NF)r+   r,   input_kwarg_layoutsdesired_input_kwarg_layoutsr:   float8_dtypefwd_config_submodule_fqnc                    t                                          |||||           || _        d | _        || _        | j        t
          j        k    rt          d          d S )N)r+   r,   r`   ra   r:   zFPrepareFloat8ModuleInput only support casting to float8_e4m3fn for now)rI   __init__rb   r'   rc   torchfloat8_e4m3fnNotImplementedError)	rL   r+   r,   r`   ra   r:   rb   rc   rM   s	           r   re   z!PrepareFloat8ModuleInput.__init__   s}     	'"7 3(C- 	 	
 	
 	
 ) $(@% 333%X   43r   c                 z   |t          |t                    r|}n=t          |t          j                  s
J d            t          j        |||fd          }t          |t          | j        t          j	                  }|||k    r|
                    |f          }| j        r|                                n|S |S )Nz%expecting input to be a torch.Tensor!Fr   r   )r   )r"   r   rf   Tensorr#   r   r	   r'   r   r(   r)   r:   r7   )rL   inputmeshinput_layoutdesired_layoutdt_inps         r   _prepare_input_argz+PrepareFloat8ModuleInput._prepare_input_arg   s    #%)) 
 !%66  ; 6 !+4,E   1% - 3	  F )ln.L.L,,8I,JJ(,(=I6??$$$6ILr   r=   r/   r>   c                    ddl m} | j        9|                    | j                  }t	          ||          sJ |j        | _        nU|                                D ]@}t	          ||          r.| j        |j        | _        &| j        |j        k    s
J d            A| j        J t                                          ||           |S )Nr   rA   z?All the Float8Linear modules should have same linear_mm_config!)	rE   rB   rc   get_submoduler"   r'   modulesrI   rJ   )rL   r=   r/   rB   
fwd_linearr-   rM   s         r   rJ   zPrepareFloat8ModuleInput._apply   s    ======(4--d.KLLJj,77777$.$?D!! ~~''  c<00 ,4030D--#48LLLL]  MLL $000v{+++r   )rP   rQ   rR   rS   rf   rg   re   rp   rU   rV   r   rJ   rW   rX   s   @r   r_   r_      s         * " $((!%      :  6RY Z BI          r   r_   )rf   torch.nnrU   torch.distributed._tensorr   torch.distributed.device_meshr   !torch.distributed.tensor.parallelr   r   r   torchao.float8.configr   r	    torchao.float8.distributed_utilsr
   #torchao.float8.float8_scaling_utilsr   r   %torchao.float8.float8_training_tensorr   r   r   rZ   r_    r   r   <module>r~      s          - - - - - - 4 4 4 4 4 4          : 9 9 9 9 9 9 9 I I I I I I        @ ? ? ? ? ?  >3 >3 >3 >3 >3O >3 >3 >3B<3 <3 <3 <3 <3O <3 <3 <3~_ _ _ _ _1 _ _ _ _ _r   