
    PiFL                        U d dl mZmZmZmZ d dlZd dlmZ d dlm	Z	m
Z
 d dlmZmZ ej        j        Zej        j        Zej        j        Zi Zeeef         ed<   	 	 	 d&dej        d	ej        d
ej        dej        dej        deej                 deej                 dedej        fdZd Zd Z eej        j        ej        j        ej        j        ej        j        ej        j         ej!        j        g          d'd            Z" eej#        j        g          d'd            Z$ eej%        j        ej&        j'        g          d'd            Z( eej)        j        g          d'd            Z* eej+        j        g          d'd            Z, eej-        j        g          d'd            Z. eej/        j0        g          d'd            Z1de	de	fdZ2 eej3        j        ej4        j        g          d'd            Z5 eej6        j        g          d'd            Z7 eej8        j        g          d'd             Z9 eej:        j        g          d'd!            Z; eej<        j        ej<        j        g          d'd"            Z= eej>        j        ej>        j        g          d'd#            Z? eej@        j        g          d'd$            ZA eejB        j        g          d'd%            ZCdS )(    )AnyDictOptionalTupleN)tree_map)Float8TrainingTensorchoose_scaled_mm_config)is_row_majorpad_tensor_for_matmulFLOAT8_OPS_TABLEFa_dataa_scaleb_datab_scaleoutput_dtypeoutput_scalebiasuse_fast_accumreturnc           
      V   |                                 }|                                 }	d}
|j        | j        d         dfk    o|j        d|j        d         fk    }|r1|s/||	z  }
|                    d          }|                    d          }	|}|t          j        t          j        fv r|rt          j        }d}|t          j        k    r|}d}t          j        | |||	||||          }|
||
z  }|||z  }|t          j        t          j        fv r|r|                    |          }|S )z
    This is the unwrapped version of addmm_float8, which does not take in Float8TrainingTensors
    as inputs. This is used to standardize the logic between subclassed and non subclassed
    versions of the linear module.
    Nr       )scale_ascale_br   scale_result	out_dtyper   )	
reciprocalshapenew_onestorchfloat16float32bfloat16
_scaled_mmto)r   r   r   r   r   r   r   r   a_inverse_scaleb_inverse_scalepost_inverse_scaleis_rowwise_scaling
orig_dtype	post_biasoutputs                  m/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torchao/float8/float8_ops.pyaddmm_float8_unwrappedr.      so    ((**O((**O 6<?A*>> 7=	QU D
  7. 7 ->)22266)22266 JemU]3338J3~Iu}$$	!%	 	 	F %$$)emU]3338J3:&&M    c                 L    t          |j                  dv sJ |  d            d S )N)r   r   z+ with axiswise scaling is not supported yet)lenr   )aten_opscales     r-   _assert_tensorwise_scaler4   ^   s9     	EKF"""
>>> 	#""""r/   c                       fd}|S )z(Register aten ops to the float8 op tablec                     D ]:}|t           v r%t          d| dt           |         j                   | t           |<   ;| S )Nz
Float8 op z is already registered to )r   RuntimeError__name__)funcopaten_opss     r-   	decoratorzimplements.<locals>.decoratori   sb     	( 	(B%%%"^^^?OPR?S?\^^   $(R  r/   r   )r;   r<   s   ` r-   
implementsr=   f   s$         r/   c                     t          | |d         j                    | |d         j        g|dd          R i |}t          ||d         j        |d         j        |d         j        |d         j                  S Nr   r   )r4   _scale_datar   _orig_dtype_linear_mm_config_gemm_input_role)r2   argskwargsnew_datas       r-   float8_desugar_oprH   u   s     Wd1gn555wtAw}:tABBx:::6::HQQQ!Q   r/   c                      | |d         j         g|dd          R i |} | |d         j        g|dd          R i |}t          |||d         j        |d         j        |d         j                  S r?   )rA   r@   r   rB   rC   rD   )r2   rE   rF   rG   	new_scales        r-    float8_desugar_data_and_scale_oprK      s     wtAw}:tABBx:::6::HQ<abb<<<V<<IQQ!Q   r/   c                     | |d         j         g|dd          R i |}|d         j        j        dk    r  | |d         j        g|dd          R i |}n|d         j        }| t          j        j        k    rt          | |d         j                   |d         j        }|}||dk    r|dk     n|dk     t          |||d         j	        |d         j
        |d         j        |          S )Nr   r   )rA   r@   ndimaten	transposeintr4   _axiswise_dimr   rB   rC   rD   )r2   rE   rF   rG   rJ   old_axiswise_dimnew_axiswise_dims          r-   float8_transposerU      s    wtAw}:tABBx:::6::HAw~QGDGN@T!""X@@@@@		GN	$.$$$ $q'.999Aw,'#q  """!!QQ!Q   r/   c                    |d         |d         }}|t          |j        j                  k    rj | |d         j        g|dd          R i |}t          ||d         j        |d         j        |d         j        |d         j        |d         j                  S t          |d         j        j                  dk     rt          | ||          S |j        }t          |          dk    r|dk    rP | |j        |fi |}d|d         g} | |j        |fi |}t          |||j        |j        |j        |j                  S |dk    s|t          |j                  dz
  k    rM | |j        |fi |}|d         dg} | |j        |fi |}d}	t          |||j        |j        |j        |	          S t          |  d|j         d|j        j         d|j         d| d	
          )
Nr   r      rM   z# with axiswise scaling and t.shape z t._scale.shape z t._axiswise_dim z new_shape z is not supported yet.)listrA   r   r   r@   rB   rC   rD   rR   r1   rH   AssertionError)
r2   rE   rF   t	new_shaperG   axiswise_dimnew_scale_shaperJ   rT   s
             r-   float8_viewr^      sw   7DGyA D''''747=>48>>>v>>#GNGG%G$G!
 
 	
 47>  1$$ $777 ?L
9~~1wqw	<<V<<H )B-0O/DDVDDI'#"   R<CLL14D#E#Ewqw	<<V<<H(|Q/O/DDVDDI!'#"      	o  	oqw  	o  	oPQPXP^  	o  	oqr  rA  	o  	o  NW  	o  	o  	o  r/   c                      | d         j         gdd          R i |}t          | d         j                   fd}t          ||          }t	          |          S )Nr   r   c                     t          | d         j        d         j        d         j        d         j                  S )Nr   )r   r@   rB   rC   rD   )datarE   s    r-   make_float8z!float8_split.<locals>.make_float8   s?    #GNGG%G$
 
 	
r/   )rA   r4   r@   maprX   )r2   rE   rF   new_data_tensorsrb   outs    `    r-   float8_splitrf      s~    wtAw}BtABBxBBB6BBWd1gn555
 
 
 
 
 k+
,
,C99r/   c                    |d         }|d         j         }|d         j        }|d         j        }|d         j        j        }|d         j        }g }	|D ]}
t          |
t                    s
J d            |
j         |k    s
J d            |
j        |u s
J d            |
j        |u s
J d            |
j        j        |k    s
J d            |
j        |u s
J d            t          | |
j                   |		                    |
j        
                    t          j                              | |	g|dd          R i |}|
                    |          }t          |||||          S )	Nr   z7Expecting all chunks to be of type Float8TrainingTensorz,Expecting all chunks to be of the same dtypezCExpecting all chunks to have thee same scale as a result of a splitzGExpecting all chunks to have thee same mm config as a result of a splitzCExpecting all chunks to be of the same dtype as a result of a splitzLExpecting all chunks to have the same gemm_input_role as a result of a splitr   )rB   r@   rC   rA   dtyperD   
isinstancer   r4   appendviewr    uint8)r2   rE   rF   chunked_tensorsr*   r3   	mm_config	fp8_dtypegemm_input_role
chunk_datachunkrG   s               r-   
float8_catrs     s   377O #/JA%E"4I"(.I%a(9OJ  9 9%!566 	
 	
E	
 	
6  J...: /.. |u$$$Q %$$ &)333U 433 { I---Q .-- %888Z 988 	!%,777%+**5;778888wz7DH77777H}}Y''H%YXXXr/   c                     t          | |d         j                   d }t          ||          }t          ||          } | |i |S )a)  Be careful with this function, this is a "fallback" op that
    casts the output of the op to the original precision. And performs the op.

    We currently need this to support the backward for admmm bias.
    "addmm" -> out
    "hp_gradBias" <-"sum" <- "identity" <- gradOut <- "hp_gradOut"
    r   c                 X    t          | t                    r|                                 S | S N)ri   r   to_original_precision)xs    r-   unwrapz!float8_cast_up_op.<locals>.unwrap8  s,    a-.. 	-**,,,r/   )r4   r@   r   )r2   rE   rF   ry   new_args
new_kwargss         r-   float8_cast_up_opr|   -  s^     Wd1gn555  
 %%H&&))J7H+
+++r/   abc                    | j         }| j        }|j         }t          | j        | j        |j        |j                  }|j        r| j                             d          |j                             d          k    s@J d| j                             d           d|j                             d                       t          |d          }t          |d          }t          |	                                          s|
                                }t          |	                                          r8|                                
                                                                }|j        }| j        <|j        5|                    |j        d                                       dd          }nB| j        ;|j        4|                    |j        d                                       dd          }||||fS )Nr   r   z"Inner dims must match for mm, got z and )dimsrM   )rA   r@   r	   rD   rC   pad_inner_dimsizer   r
   stride
contiguousrZ   rR   repeatr   reshape)r}   r~   r   r   r   scaled_mm_configr   s          r-   preprocess_addmmr   B  s   WFhGWF.					  % 7w||A!',,q//111XaXXqw||TUXX 211 'vA666&vA666(( %""$$FMMOO$$ -&&((**,,hG 	1?#>..a1199"a@@	
	$)@..a1199!R@@7FG++r/   c           
         |d         }|d         }t          |t                    rt          |t                    s8J d                    t          |          t          |                                t	          ||          \  }}}}|j        }	t          |j        |j        |j        |j                  }
|
j	        rft          j        |j                                        |j        z  |j                                        |j        z                                |	          S t!          |||||	d d |
j                  }|S )Nr   r   zFExpecting  both Float8TrainingTensor for mm inputs but found {} and {}r   r   r   )ri   r   formattyper   rB   r	   rD   rC   emulater    mmrA   floatr@   r%   r.   r   )r2   rE   rF   r}   r~   r   r   r   r   r   r   
tensor_outs               r-   	float8_mmr   k  sN   QAQAa-.. :	4 4  OVVQa   
 (81'='=$FGVW=L.					   
x!(2AGMMOOah4NOORR
 
 	
 ('6	 	 	J r/   c           
         t          |d         t          j                  r6t          |d         t                    rt          |d         t                    sJ |d         }|d         }|d         }t	          ||          \  }}}}	|j        }
|j        |
k    s
J d            t          |j        |j	        |j        |j	                  }|j
        rkt          j        |j                                        |j        z  |j                                        |j        z                                |
          }||z   S t!          ||||	|
d ||j                  }|S )Nr   r   rW   z"bias dtype must match output dtyper   )ri   r    Tensorr   r   rB   rh   r	   rD   rC   r   r   rA   r   r@   r%   r.   r   )r2   rE   rF   r   r}   r~   r   r   r   r   r   r   re   r   s                 r-   float8_addmmr     sl    	47EL))tAw 455 tAw 455 	6 7DQAQA'71'='=$FGVW=L:%%%'K%%%.					   hqw}}117==??QX3MNNQQ
 
 Tz''6	 	 	J r/   c                 p    t          | |d         j                   |d         j        |d         j        k    S r?   )r4   r@   r   r2   rE   rF   s      r-   float8_is_same_sizer     s/    Wd1gn5557=DGM))r/   c           	      t   t          |d         t                    sJ t          |          dk    rd|v s
J d            |d         t          j        t          j        hv s
J d            t          |d         j        |d         j        |d         |d         j        |d         j	        |d         j
                  S )zThis gets called when running matmul under autocast
    when the input is a Float8TrainingTensor, presenting as a fp32
    tensor.
    r   r   rh   z%Only support dtype kwarg for autocastzKOnly support floating point conversion for autocast w/ Float8TrainingTensor)ri   r   r1   r    r!   r#   rA   r@   rC   rD   rR   r   s      r-   autocast_to_copyr     s     d1g344444v;;!6 1 1 1/ !2 11 '?    U    QQwQ!Q Q  r/   c                 R   t          | |d         j                   |d         }t          |t                    sJ dt	          |                       |j        }|                                } | |g|dd         R i |}t          ||j        |j        |j        |j	                  S )z+
    override funcol with FP8 handling
    r   z9expecting a Float8TrainingTensor for allgather but found r   N)
r4   r@   ri   r   r   rA   r   rB   rC   rD   r2   rE   rF   	fp8_inputfp8_datafp8_outs         r-   allgather_fp8r     s     Wd1gn555QIi!566  UDOOUU 6 H""$$Hgh4abb444V44G#"  r/   c                     t          | |d         j                   |d         }t          |t                    sJ |j        } | |g|dd          R i |}t          ||j        |j        |j        |j                  S r?   )r4   r@   ri   r   rA   rB   rC   rD   r   s         r-   wait_tensor_fp8r     s    Wd1gn555QIi!566666Hgh4abb444V44G#"  r/   c                    |d         }|d         }t          |t                    sJ t          |t                    sJ t          ||d         j                   |j        |j        k    sJ |j        |j        k    sJ |j        |j        k    sJ |j        }|j        } | ||d         |g|dd          R i |}t          ||j        |j        |j        |j                  S )Nr   rW   r      )	ri   r   r4   r@   rh   rB   rA   rC   rD   )r2   rE   rF   fp8_self
fp8_valuesr   fp8_values_datar   s           r-   index_put_fp8r     s   AwHaJh 455555j"677777XtAw~666?j/////>Z-----:#99999~H &OghQN48NNNvNNG"!  r/   c                 2   |d         }|d         }t          |t                    sSt          |t                    r>|                                }t          | |j                    | ||g|dd          R i |S t          |t                    rt          |t                    rt          | |j                   |j        |j        k    s
J d            |j        |j        k    s
J d            |j        |j        k    s
J d            |j        j        |j        j        k    s
J d            |j	        |j	        k    s
J d             | |j        |j        g|dd          R i |}t          ||j        |j        |j        |j	                  S t          d	          )
Nr   r   rW   z<Expecting both Float8TrainingTensors to be of the same dtypez<Expecting both Float8TrainingTensors to have thee same scalez@Expecting both Float8TrainingTensors to have thee same mm configz=Expecting both Float8TrainingTensors to be of the same dtypetzEExpecting both Float8TrainingTensors to have the same gemm_input_rolez7Unsupported semantics for copy_ in Float8TrainingTensor)ri   r   rw   r4   r@   rB   rC   rA   rh   rD   r7   )r2   rE   rF   selfsrcsrc_hpr   s          r-   copy_fp8r     s    7D
q'Cd011 "Vj!7 7 "V **,, #*555wtV9d122h999&999	D.	/	/ VJ!5 5 V 	!#*5553?222J 322 {cj(((J )(( %)>>>>N ?>> z39?222K 322 $(<<<<S =<< '$*ciE$qrr(EEEfEE#K"!
 
 	
 TUUUr/   )NNFrv   )Dtypingr   r   r   r   r    torch.utils._pytreer   %torchao.float8.float8_training_tensorr   r	   torchao.float8.float8_utilsr
   r   opsrO   c10d_functional_c10d_functionalr   __annotations__r   rh   boolr.   r4   r=   _unsafe_viewdefault
as_stridedcloneslicefill_Scalarr   rH   detachrK   rZ   rP   rQ   rU   rk   r^   splitrf   catrs   sumdim_IntListr|   r   r   matmulr   addmmr   is_same_sizer   _to_copyr   all_gather_into_tensorr   wait_tensorr   
index_put_r   copy_r   r   r/   r-   <module>r      s    . - - - - - - - - - - - -  ( ( ( ( ( (        L K K K K K K Ky~)+9- #% $sCx. % % % ,0#' ? ?L?\? L? \	?
 +? 5<(? 5<
 ? ? \? ? ? ?D? ? ?   !


	 		 	 		 		  
	 	 	 
	     8 TY  3 3 3 ! 3l TZ     ! $ TX!Y !Y !Y  !YH TX!"##, , , $#,(&,, &,1E &, &, &, &,R TW_dk1233   43D TZ !!! ! ! "!!H T&'((* * * )(*
 T]"#$$   %$. .6/7    , _(02B2N2VWXX   YX  T_$%&&   '&, TZ !!-V -V -V "!-V -V -Vr/   