
    Pim                     4   d dl mZmZmZmZ d dlZd dlmZ d dl	m
Z
mZ d dlmZ dZej                                        oej        j        duZej        ej        ej        ej        hZ ej                    	 d!dej        dej        d	efd
            Z ej                    ddej        dfdej        dededee         dej        f
d            Z  ej                    ddej        ddfdej        dej        dededee         d	edej        fd            Z!dej        dej        fdZ"dej        dej        dej        fdZ#dej        dej        deedf         fdZ$d Z%dededefdZ&dej        deeee         f         dej        fdZ'dej        fd Z(dS )"    )IterableOptionalTupleUnionN)AsyncCollectiveTensor
all_reduce)ScalingGranularityg-q=Famaxfloat8_dtyperound_scales_to_power_of_2c                 F   |                      t          j                  } |t          v rUt          j        |          j        t          j        | t                    z  }|                     t          j                  }nt          d|           |rt          |          }|S )zConverts the amax value of a tensor to the fp8 scale.
    Args:
        amax: The amax value of the tensor.
        float8_dtype: The float8 dtype.
        round_scales_to_power_of_2: if true, round scaling factor down to the nearest power of 2.
    )minUnsupported float8_dtype: )totorchfloat64	FP8_TYPESfinfomaxclampEPSfloat32
ValueError_round_scale_down_to_power_of_2)r
   r   r   ress       o/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torchao/float8/float8_utils.pyamax_to_scaler      s     775=!!Dy  k,''+ek$C.H.H.HHffU]##DlDDEEE! 3-c22J    xreduce_amaxscaling_granularityaxiswise_dimreturnc                 :   |t           j        u r't          j        t          j        |                     }nM|t           j        u s
J d            |
J d            t          j        t          j        |           |d          }|rt          j                    r||	                                nd }|-t          t          t          j                                        n|}t          |d|          }t          |t                    r|                                }|S )NunsupportedT)dimkeepdimMAX)r	   
TENSORWISEr   r   absAXISWISEr
   distis_initialized	get_grouplistrangeget_world_sizer   
isinstancer   wait)r   r    device_meshr!   r"   r
   pggroups           r   tensor_to_amaxr7   8   s    0;;;y1&&"&8&AAAA=AAA'''''z%)A,,L$GGG
  t*,, (3(?[""$$$T68jU4.0011222b$u--d122 	99;;DKr   	hp_tensorc                 L    t          | ||||          }t          |||          S )a  
    Compute scaling factor for the given high precision tensor.

    Args:
        hp_tensor: high precision tensor
        float8_dtype: the float8 dtype to use
        reduce_amax: whether to reduce the max(abs(hp_tensor)) value across distributed ranks
        scaling_granularity: Defines the scaling granularity
        axiswise_dim: if axiswise granularity is used, defines the dim to scale across
        round_scales_to_power_of_2: if true, round scaling factor down to the nearest power of 2.
    )r   )r7   r   )r8   r   r    r4   r!   r"   r   r
   s           r   tensor_to_scaler:   U   sC    *  D l7Q   r   c                     |t           v rFt          j        |          j        }|                     | |          } |                     |          S t          d|           )a  Converts a tensor to a saturated fp8 tensor.

    Note:
        The default behavior in PyTorch for casting to `float8_e4m3fn`
        and `e5m2` is to not saturate. In this context, we should saturate.
        A common case where we want to saturate is when the history of a
        tensor has a maximum value of `amax1`, and the current amax value
        is `amax2`, where `amax1 < amax2`. This is common when using delayed
        scaling.
    )r   r   r   )r   r   r   r   r   r   r   )r   r   	max_values      r   to_fp8_saturatedr=   v   sa     y  K--1	GG
	G22ttL!!!DlDDEEEr   yc                     t           j                            |           }t           j                            | |z
            }dt          j        ||z            z  S )zComputes the error between two tensors in dB.

    For more details see:
        https://en.wikipedia.org/wiki/Signal-to-noise_ratio

    Args:
        x: The original tensor.
        y: The tensor to compare to the original tensor.
       )r   linalgvector_normlog10)r   r>   PsPns       r   compute_errorrF      sK     
	!	!!	$	$B		!	!!a%	(	(BBG$$$$r   tensor.c                    |t           v rt          j        |          j        }nt	          d|           | j                            | j                  }t          j        |          |k    	                                
                                }|dk    	                                
                                }||fS )zCalculate FP8 tensor stats

    Args:
        tensor: The tensor to calculate stats for.
        float8_dtype: The float8 dtype.

    Returns:
        A tuple containing the number of zeros and the number of max values.
    r   )dtyper   )r   r   r   r   r   _datar   _orig_dtyper*   sumitem)rG   r   FP8_MAXtensor_orig_typenum_maxnum_zeros         r   fp8_tensor_statisticsrR      s     y  +l++/DlDDEEE|V-?@@y)**g5::<<AACCG A%**,,1133Hgr   c                 x    t          |           dk    s
J d            | d         | d         k    o| d         dk    S )N   z%is_row_major only supports 2D tensorsr      )len)strides    r   is_row_majorrX      sB    v;;!D!9vay 3VAY!^3r   sizealignment_valuec                     d| dz
  |z  z   |z  S )a  
    Returns the minimum alignment value that is greater than or equal to the given size.

    Args:
        size: The size of the data to be aligned.
        alignment_value: The alignment value to be used.

    Returns:
        int: The minimum alignment value that is greater than or equal to the given size.

    Usage:
    ```
        >>> _get_min_alignment(10, 8)
        16
    ```
    rU    )rY   rZ   s     r   _get_min_alignmentr]      s    " $(./?BBr   dimsc                 8   |                                  dk    sJ | j        \  }}t          |t                    r|f}d|v rt	          |d          n|}d|v rt	          |d          n|}||z
  }||z
  }t
          j        j                            | d|d|f          S )aP  
    Pads a 2D tensor with zeros to ensure that its dimensions are multiples of 16, which is required `torch._scaled_mm`

    Args:
        tensor: The tensor to pad.
        dims: Dimensions to pad.

    Returns:
        torch.Tensor: The padded tensor.

    Usage:
    ```
        >>> pad_tensor_for_matmul(torch.randn((10, 10)), dims=0).shape
        torch.Size([16, 10])
        >>> pad_tensor_for_matmul(torch.randn((10, 10)), dims=1).shape
        torch.Size([10, 16])
        >>> pad_tensor_for_matmul(torch.randn((10, 10)), dims=(0, 1)).shape
        torch.Size([16, 16])
    ```
    rT   r      rU   )	r&   shaper2   intr]   r   nn
functionalpad)rG   r^   dim1dim2dim1_aligneddim2_alignedpad_dim1pad_dim2s           r   pad_tensor_for_matmulrl      s    . ::<<1JD$$ w 4599%dB///$L3499%dB///$L d"Hd"H8""6AxH+EFFFr   scalec                     | j         t          j        k    s
J d            t          j        t          j        t          j        |                               S )Nzscale must be float32 tensor)rI   r   r   exp2floorlog2)rm   s    r   r   r      sD    ;%-''')G''':ek%*U"3"344555r   )F))typingr   r   r   r   r   torch.distributeddistributedr,   )torch.distributed._functional_collectivesr   r   torchao.float8.configr	   r   cudais_availableversionhipIS_ROCMfloat8_e4m3fnfloat8_e5m2float8_e4m3fnuzfloat8_e5m2fnuzr   no_gradTensorrI   boolr   r)   rb   r7   r:   r=   rF   rR   rX   r]   rl   r   r\   r   r   <module>r      sI   4 3 3 3 3 3 3 3 3 3 3 3              W W W W W W W W 4 4 4 4 4 4 
*
!
!
#
#
E(9(E						  (- 
,+ !%   0  .@.K"& | ,	
 3- \   8  .@.K"&', |+ 
 , 3- !% \   @F FEK F F F F&%U\ %el %u| % % % %L(-
38_   ,4 4 4
CS C3 C3 C C C C(%GL%G %c8C=&8 9%G
\%G %G %G %GP65< 6 6 6 6 6 6r   