
    Pi:'                        d Z ddlZddlmZmZmZmZmZ ddlZddl	m
Z
mZ ddlmZ ddlmZmZmZ ddlmZmZ ej        Z G d d	e          Zd
edededeeef         fdZdej        deedf         fdZ	 	 	 d-d
ededededej        dee         dee         dedefdZdej        dee         dededed edej        fd!Zd"ej        defd#Zd"ej        defd$Zd"ej        defd%Z d"ej        defd&Z!d'eeeeef         e"e         f         defd(Z#d)eeeeeef         e"e         f                  deeef         fd*Z$d+eeef         ddfd,Z%dS ).z;
Defines an nn module designed to be used during inference
    N)List
NamedTupleOptionalTupleUnion)is_row_majorpad_tensor_for_matmul)FP8Granularity)PerBlockPerRow	PerTensor)is_MI300is_sm_at_least_89c                   >    e Zd ZU dZdZeed<   dZeed<   dZeed<   dS )Float8MMConfiga  
    Configuration for the scaled_mm in the forward and backward pass.

    Attributes:
        emulate (bool): Whether to emulate the matmuls in fp32.
        use_fast_accum (bool): Whether to use the fast-accumulation option for scaled_mm.
        pad_inner_dim (bool): Whether to pad the inner dimension of a and b with 0s.
                              This is needed for matmuls not aligned to 16.
    Femulateuse_fast_accumpad_inner_dimN)	__name__
__module____qualname____doc__r   bool__annotations__r   r        l/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torchao/float8/inference.pyr   r      sN           GT ND   M4r   r   a_datab_datascaled_mm_configreturnc                 <   |j         r|                     d          |                    d          k    s6J d|                     d           d|                    d                       t          | d          } t          |d          }t          |                                           s|                                 } t          |                                          r8|                                                                                                }| |fS )a  Preprocess the inner fp8 data tensors for admmm
    Args:
        a_data: Input tensor A.
        b_data: Input tensor B.
        scaled_mm_config: Configuration for _scaled_mm.
    Returns:
        Preprocessed tensors A and B in the format for _scaled_mm.
       r   z"Inner dims must match for mm, got z and )dims)r   sizer	   r   stride
contiguoust)r   r   r    s      r   preprocess_datar)   .   s     % 7{{1~~Q///VQVVfkkRSnnVV 0// 'vA666&vA666(( %""$$FMMOO$$ -&&((**,,6>r   input_scaleinput_shape.c                     |                                  dk    r|                     dd          S |                     d          } |                                 dk    r!|                     d| j        d                   } | S )z:Ensures input tensor is correctly formatted for _scaled_mmr#      )numelreshape	unsqueezedimshape)r*   r+   s     r   preprocess_scaler4   H   s     a""1a((( ''++K 1!))"k.?.CDDr   Fa_scaleb_scaleoutput_dtypeoutput_scalebiasr   c           
          |t           j        k    r"| t          j        | ||||||          }||z   S t          j        | |||||||          S )z
    This is the unwrapped version of addmm_float8, which does not take in Float8TrainingTensors
    as inputs. This is used to standardize the logic between subclassed and non subclassed
    versions of the linear module.
    N)scale_ascale_bscale_result	out_dtyper   )r;   r<   r9   r=   r>   r   )torchfloat32
_scaled_mm)	r   r5   r   r6   r7   r8   r9   r   outputs	            r    addmm_float8_unwrapped_inferencerC   Z   s      u}$$)9!%")
 
 
 }!%	 	 	 	r   scale
data_shaper2   startendstepc                     t           j        j        } j        k    r|j                             ||||          S t           fdt          t                              D                       }|t          |          k    r S ||         }|dk    r|j                             ||||          S |||z  nd}	|||z   dz
  |z  nd}
|dk    rt          d          |j                             ||	|
d          S )z
    Slice the scale tensor appropriately based on the data tensor slicing.
    This function calculates how the scale should be sliced when the data tensor
    is sliced along a given dimension, taking into account the block structure.
    c              3   D   K   | ]}|         j         |         z  V  d S )N)r3   ).0irE   rD   s     r   	<genexpr>z-_slice_scale_for_dimension.<locals>.<genexpr>   s2      XXA
1Q7XXXXXXr   r#   Nz;Slicing with step > 1 is not implemented for scale tensors.)
r?   opsatenr3   sliceTensortuplerangelenNotImplementedError)rD   rE   r2   rF   rG   rH   rO   block_sizesblock_size_for_dimscale_start	scale_ends   ``         r   _slice_scale_for_dimensionrZ      s>    9>D {j  z  UC>>> XXXXXs:AWAWXXXXXK
c+$S)Q z  UC>>> 6;5Fe111D  %%).@@@ 	 !88%M   z  [)QGGGr   xc                     t          | d          s
J d            t          | j                  d|                                 dz
  z  | j        d         fz   k    S )~Checks if a quantized tensor is rowwise scaled
    Args:
        x: quantized tensor (should have `block_size` attribute)
    
block_size.Expecting input to have `block_size` attribute)r#   r#   r-   )hasattrrR   r^   r2   r3   r[   s    r   _is_rowwise_scaledrb      sV    
 1l##UU%UUU#$!%%''A+"6!'"+"GGGr   c                      t           d          s
J d            t           fdt           j                  D                       S )r]   r^   r_   c              3   r   K   | ]1}j         |         d k    pj         |         j        |         k    V  2dS )r-   N)r^   r3   )rK   rL   r[   s     r   rM   z(_is_tensorwise_scaled.<locals>.<genexpr>   sU        CDQ2>aAGAJ!>     r   )r`   allrS   ndimra   s   `r   _is_tensorwise_scaledrg      sd    
 1l##UU%UUU#    HMaf     r   c                     t          | d          s
J d            | j        }t          |          dk    o+t          j        |dd                   dk    o|d         dk    S )zChecks if a quantized tensor is scaled with a block size of 1x128
    Args:
        x: quantized tensor (should have `block_size` attribute)
    r^   r_   r.   Nr-   r#      )r`   r^   rT   mathprodr[   bs     r   _is_1_128_scaledrn      sc    
 1l##UU%UUU#	Aq66Q;B49QssV,,1BaeslBr   c                     t          | d          s
J d            | j        }t          |          dk    o|d         dk    o|d         dk    S )zChecks if a quantized tensor is scaled with a block size of 128x128
    Args:
        x: quantized tensor (should have `block_size` attribute)
    r^   r_   r.   r   ri   r#   )r`   r^   rT   rl   s     r   _is_128_128_scaledrp      sT    
 1l##UU%UUU#	Aq66Q;61Q43;61Q43;6r   gc                     t          |           dk    o5| d         t          ddg          k    o| d         t          ddg          k    S )Nr.   r   r#   ri   )rT   r   )rq   s    r   !_granularity_is_a_1_128_w_128_128rs      sI     q66Q;V1Q48QH#5#55V!A$(CQT:BVBV:VVr   granularityc                    d }| t                      t                      f}nDt          | t           t          f          r| | f}n"t          | t          t          f          rt          |           dk    rt          | d         t                     ot          | d         t                     }t          | d         t                    ot          | d         t                    }t          |           }|s|s|st          d|  d          t          | d         t          | d                             st          d|  d          t          |           }nt          d|  d          |S )Nr.   r   r#   zUnsupported granularity types: .zEDifferent granularities for activation and weight are not supported: z#Invalid granularity specification: )	r   
isinstancer   rR   listrT   rs   
ValueErrortype)rt   processed_granularityis_per_tensor
is_per_rowis_a_1_128_w_128_128s        r   _normalize_granularityr      s    !!*ikk :	K)V!4	5	5 O!,k :	K%	/	/ OC4D4D4I4I";q>9== 
*NIC
 C
  A77 
JNF=
 =

  AMM 	O 	O/C 	OM{MMMNNN+a.${1~*>*>?? 	fXcfff   !&k 2 2M{MMMNNN  r   granularitiesc                    t          | d         t                    ot          | d         t                    }t          | d         t                    ot          | d         t                    }t          |           }|s|r*t	                      st                      sJ d            dS dS |rt	                      s
J d            dS t          d|  d          )a9  
    Validate that the hardware supports the requested granularities.

    Args:
        granularities: Tuple of (activation_granularity, weight_granularity)

    Raises:
        AssertionError: If hardware doesn't support the requested granularity
        ValueError: If invalid granularity type is provided
    r   r#   uN   Float8 dynamic quantization requires CUDA compute capability ≥8.9 or MI300+.u[   Float8 1x128 activation and 128x128 weight scaling requires CUDA compute capability ≥8.9.zInvalid granularities rv   N)rw   r   r   rs   r   r   ry   )r   r|   r}   r~   s       r   _check_hardware_supportr   
  s    }Q/;; 
a)A AM M!,f55 *a&; ;J =]KK 
D
 
D "" 	
hjj 	
 	
\	
 	
0 	
 	
 	
 	
 
 D "" 	
 	
i	
 	
" 	
 	
 B-BBBCCCr   )NNF)&r   rj   typingr   r   r   r   r   r?   torchao.float8.float8_utilsr   r	   torchao.float8.typesr
    torchao.quantization.granularityr   r   r   torchao.utilsr   r   rQ   r   r)   intr4   dtyper   rC   rZ   rb   rg   rn   rp   rx   rs   r   r   r   r   r   <module>r      s     ; ; ; ; ; ; ; ; ; ; ; ; ; ;  K K K K K K K K / / / / / /         
       
 
         Z        % 66>	   4%, U38_    0 &*! % %%% % 	%
 +% 6"% 6
% % % % % %P0H<0HS	0H 
0H 	0H
 
0H 0H \0H 0H 0H 0HfH%, H4 H H H HU\ d    C C C C C C7%, 74 7 7 7 7Wnn,-^	W 
W W W W !..01 "	
 ! >>)* !  !  !  !FD78D	D D D D D Dr   