
    `iB              	          U d dl mZmZ d dlZd dlZd dlmZ d dlmZ d dl	m
Z
 d dl	mZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dl	mZ d dlmZ  G d de          Z G d de          Z G d de          Z G d de          Z G d de          Z G d de          Z G d de          Z G d de          Z G d de          Z G d d e          Z G d! d"e          Ze e            e  e            e! e            e" e            iZ#eeef         e$d#<    e            Z% e            Z& e            Z' e            Z( ed$          Z) ed%          Z* e            Z+ ed&d'ej,        rd(nd)z             Z- ed*d+          Z. ed,d-          Z/ ed.d/          Z0 ed0d/          Z1 ed1d2          Z2 ed3d2          Z3 ed4d/ej,        rd(nd5z             Z4 ed6d/          Z5 ed7d/          Z6 ed8d/          Z7d9ej,        rd(nd:z   Z8 ed;e8          Z9 ed<e8          Z: ed=e8          Z; ed>e8          Z<dS )?    )AnyMappingN)runtime)device)_cuda_types)_cuda_typerules)BuiltinFunc)Data)Constant)Range)_compile)reducec                   0     e Zd Zdd fd
ZdddZ xZS )	RangeFuncNunrollc                H    t                                                       dS )a  Range with loop unrolling support.

        Args:
            start (int):
                Same as that of built-in :obj:`range`.
            stop (int):
                Same as that of built-in :obj:`range`.
            step (int):
                Same as that of built-in :obj:`range`.
            unroll (int or bool or None):

                - If `True`, add ``#pragma unroll`` directive before the
                  loop.
                - If `False`, add ``#pragma unroll(1)`` directive before
                  the loop to disable unrolling.
                - If an `int`, add ``#pragma unroll(n)`` directive before
                  the loop, where the integer ``n`` means the number of
                  iterations to unroll.
                - If `None` (default), leave the control of loop unrolling
                  to the compiler (no ``#pragma``).

        .. seealso:: `#pragma unroll`_

        .. _#pragma unroll:
            https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#pragma-unroll
        Nsuper__call__)selfr   args	__class__s      l/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/cupyx/jit/_builtin_funcs.pyr   zRangeFunc.__call__   s!    6 	    c                P   t          |          dk    rt          d          t          |          dk    r't          d          |d         t          d          }}}nlt          |          dk    r |d         |d         t          d          }}}n9t          |          dk    r|\  }}}nt          dt          |                     |t          d ||||fD                       st          d          |j        }t          |t                    s9t          |t                    s$t          d	t          |          j	                   |d
u rd}|du s#d|cxk     rdk     sn t          j        d           t          |t                    r|j        dk    }n|j        j        j        dk    rd}nd }t          j        ||          }t          j        ||          }t          j        ||          }|j        j        j        dvrt          d          |j        j        j        dvrt          d          |j        j        j        dvrt          d          |j        dk    rt%          j        t                    }n|j        dk    r|j        }nJ t)          ||||||          S )Nr   z)range expected at least 1 argument, got 0         z'range expected at most 3 argument, got c              3   @   K   | ]}t          |t                    V  d S N)
isinstancer   ).0xs     r   	<genexpr>z!RangeFunc.call.<locals>.<genexpr>@   s@       = = "!X.. = = = = = =r   zCloop unrolling requires constant start, stop, step and unroll valuez-unroll value expected to be of type int, got FTl        zUloop unrolling is ignored as the unroll value is non-positive or greater than INT_MAXuiuz%range supports only for integer type.numpycudar   )len	TypeErrorr   allobjr"   intbooltype__name__warningswarnctypedtypekindr
   initmoder   Scalarr   )	r   envr   r   startstopstepstep_is_positiver4   s	            r   callzRangeFunc.call2   s   t99>>GHHHYY!^^ (T!Whqkk4EEYY!^^ $Qa(1++4EEYY!^^ $E4E#d))EEG G G  = =!&dF ;= = = = = $#$ $ $ ZFvs++ 4z&$/G/G 43<<03 34 4 4 dNNa&&:&:&:&:7&:&:&:&:;< < < dH%% 	$#x1}Z"c))##ys##	%%%ys##;!--CDDD: ,,CDDD: ,,CDDD8w&s++EEXJEE5UD$/?OOOOr   r1   
__module____qualname__r   r?   __classcell__r   s   @r   r   r      sl        %)       : '+ 8P 8P 8P 8P 8P 8P 8P 8P 8Pr   r   c                       e Zd Zd ZdS )LenFuncc                    t          |          dk    rt          dt          |                     |rt          d          |d         }t          |j        t          j                  st          d          |j        j        st          d          t          d|j         dt	          j	        d	                    S )
Nr   z#len() expects only 1 argument, got #keyword arguments are not supportedr   zlen() supports only array typezlen() of unsized arrayzstatic_cast<long long>(z.shape()[0])q)
r*   r+   r"   r4   r   CArrayndimr
   coder9   )r   r:   r   kwdsargs        r   r?   zLenFunc.callo   s    t99>>M#d))MMNNN 	CABBB1g#)[%788 	><===y~ 	64555DchDDD&s++- - 	-r   Nr1   rA   rB   r?    r   r   rF   rF   m   s#        - - - - -r   rF   c                       e Zd Zd ZdS )MinFuncc                     t          |          dk     rt          dt          |                     |rt          d          t          fd|          S )Nr   z(min() expects at least 2 arguments, got rH   c                 J    t          j        t          j        | |fd           S r!   )r   _call_ufunccupyminimumabr:   s     r   <lambda>zMinFunc.call.<locals>.<lambda>   #    8#7L1a&$$- $- r   r*   r+   r   r   r:   r   rM   s    `  r   r?   zMinFunc.call   y    t99q==F3t99FFH H H 	CABBB - - - -.24 4 	4r   NrO   rP   r   r   rR   rR   }   #        4 4 4 4 4r   rR   c                       e Zd Zd ZdS )MaxFuncc                     t          |          dk     rt          dt          |                     |rt          d          t          fd|          S )Nr   z(max() expects at least 2 arguments, got rH   c                 J    t          j        t          j        | |fd           S r!   )r   rU   rV   maximumrX   s     r   r[   zMaxFunc.call.<locals>.<lambda>   r\   r   r]   r^   s    `  r   r?   zMaxFunc.call   r_   r   NrO   rP   r   r   rb   rb      r`   r   rb   c                   $     e Zd Z fdZd Z xZS )SyncThreadsc                 H    t                                                       dS )zCalls ``__syncthreads()``.

        .. seealso:: `Synchronization functions`_

        .. _Synchronization functions:
            https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#synchronization-functions
        Nr   r   r   s    r   r   zSyncThreads.__call__   !     	r   c                 6    t          dt          j                  S )Nz__syncthreads())r
   r   voidr   r:   s     r   
call_constzSyncThreads.call_const   s    %{'7888r   r1   rA   rB   r   rn   rC   rD   s   @r   rg   rg      sG            9 9 9 9 9 9 9r   rg   c                   0     e Zd Zdd fd
ZdddZ xZS )SyncWarp    )maskc                H    t                                                       dS )a:  Calls ``__syncwarp()``.

        Args:
            mask (int): Active threads in a warp. Default is 0xffffffff.

        .. seealso:: `Synchronization functions`_

        .. _Synchronization functions:
            https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#synchronization-functions
        Nr   )r   rs   r   s     r   r   zSyncWarp.__call__   !     	r   Nc                   t           j        r"| t          j        d| dt                     d }|rzt          |t                    r#d|j        cxk    rdk    sn t          d          t          j
        |t          j        d|          }t          j        ||          }d|j         d}nd	}t          |t          j                  S )
Nmask  is ignored on HIPr   rr   mask is out of range	same_kindz__syncwarp()z__syncwarp())r   is_hipr2   r3   RuntimeWarningr"   r   r-   
ValueErrorr   _astype_scalarr   int32r
   r7   rL   rl   )r   r:   rs   rL   s       r   r?   zSyncWarp.call   s    > 	>d>>>OOO 		"$)) =tx5555:5555$%;<<<*k'c; ;D9T3''D----DD!DD+*+++r   r@   rD   s   @r   rq   rq      sc        )        !% , , , , , , , , ,r   rq   c                   (     e Zd Zd fd	ZddZ xZS )SharedMemoryNc                 H    t                                                       dS )a  Allocates shared memory and returns it as a 1-D array.

        Args:
            dtype (dtype):
                The dtype of the returned array.
            size (int or None):
                If ``int`` type, the size of static shared memory.
                If ``None``, declares the shared memory with extern specifier.
            alignment (int or None): Enforce the alignment via __align__(N).
        Nr   )r   r5   size	alignmentr   s       r   r   zSharedMemory.__call__   ru   r   c                 
   |                     d          }t          j        |          }t          |t	          j        |||                    }||j        |<   ||j        |<   t          |t	          j        |                    S )N_smem)prefix)	get_fresh_variable_namer   to_ctyper
   r   	SharedMemdeclslocalsPtr)r   r:   r5   r   r   namer4   vars           r   rn   zSharedMemory.call_const   sw    **'*::(//4.udIFFGG	$
4D+/%00111r   r!   ro   rD   s   @r   r   r      sQ             2 2 2 2 2 2 2 2r   r   c                   .     e Zd Zd Zd fd	ZddZ xZS )AtomicOpc                 Z    || _         d|z   | _        || _        d| j         d}|| _        d S )NatomicCalls the ``a  `` function to operate atomically on
        ``array[index]``. Please refer to `Atomic Functions`_ for detailed
        explanation.

        Args:
            array: A :class:`cupy.ndarray` to index over.
            index: A valid index such that the address to the corresponding
                array element ``array[index]`` can be computed.
            value: Represent the value to use for the specified operation. For
                the case of :obj:`atomic_cas`, this is the value for
                ``array[index]`` to compare with.
            alt_value: Only used in :obj:`atomic_cas` to represent the value
                to swap to.

        .. seealso:: `Numba's corresponding atomic functions`_

        .. _Atomic Functions:
            https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions

        .. _Numba's corresponding atomic functions:
            https://numba.readthedocs.io/en/stable/cuda-reference/kernel.html#synchronization-and-atomic-operations
        _op_name_dtypes__doc__r   opdtypesdocs       r   __init__zAtomicOp.__init__   sB    ]
tz   , r   Nc                 H    t                                                       d S r!   r   )r   arrayindexvalue	alt_valuer   s        r   r   zAtomicOp.__call__       r   c                 *   | j         }| j        }t          j        ||          }t	          |j        t          j        t          j        f          st          d          t          j        |||          }|j        }	|	j        j        | j        vrt          d| d|	j         d          t          j        ||	d|          }t          j        ||          }|dk    r|J |	j        j        dk    r3t#          t%          j                              dk     rt)          d	          t          j        ||	d|          }t          j        ||          }| d
|j         d|j         d|j         d}
n|J | d
|j         d|j         d}
t          |
|	          S )Nz)The first argument must be of array type.`` does not support  input.rz   CASHF   z5uint16 atomic operation is not supported before sm_70z(&, r{   )r   r   r
   r7   r"   r4   r   rJ   r   r+   r   	_indexingr5   r   r   r   charr.   r   get_compute_capabilityRuntimeErrorrL   )r   r:   r   r   r   value2r   r   targetr4   rL   s              r   r?   zAtomicOp.call   s   zX	%%%%+(:KO'LMM 	IGHHH#E5#66;4<//MMMMMMNNN'uk3GG	%%%;;%%%{3&&v46677"<<& ! ! ! ,VUKMMFYvs++FIIfkIIUZII6;IIIDD>>>::fk::UZ:::DD%   r   r!   r1   rA   rB   r   r   r?   rC   rD   s   @r   r   r      s`          8     ! ! ! ! ! ! ! !r   r   c                   *     e Zd Zd Z fdZd Z xZS )GridFuncc                     |dk    rd| _         d| _        d| _        d| _        n2|dk    rd| _         d| _        d	| _        d
| _        nt	          d          d| j          d| j         d| j         d}|| _        d S )Ngridz%Compute the thread index in the grid.z1jit.threadIdx.x + jit.blockIdx.x * jit.blockDim.xznumba.cuda.gridz+threadIdx.{n} + blockIdx.{n} * blockDim.{n}gridsizezCompute the grid size.zjit.blockDim.x * jit.gridDim.xznumba.cuda.gridsizezblockDim.{n} * gridDim.{n}zunsupported functionz        zH

        Computation of the first integer is as follows::

            a  

        and for the other two integers the ``y`` and ``z`` attributes are used.

        Args:
            ndim (int): The dimension of the grid. Only 1, 2, or 3 is allowed.

        Returns:
            int or tuple:
                If ``ndim`` is 1, an integer is returned, otherwise a tuple.

        .. note::
            This function follows the convention of Numba's
            :func:`z`.
        )_desc_eq_link_coder~   r   )r   r8   r   s      r   r   zGridFunc.__init__   s    6>>@DJJDH*DJFDJJZ1DJ7DH.DJ5DJJ34444:   X	 " J#  & r   c                 H    t                                                       d S r!   r   )r   rK   r   s     r   r   zGridFunc.__call__C  r   r   c                     t          |t                    st          d          |dk    r3t           j                            d          t          j                  S |dk    rd}n|dk    rd}nt          d	          d
	                     fd|D                       }t          j
        t          j        g|z            }|dk    rt          d| d|          S t          d| d|          S )Nzndim must be an integerr   r$   nr   )r$   yr   )r$   r   zzOnly ndim=1,2,3 are supportedr   c              3   N   K   | ]}j                             |           V   dS )r   N)r   format)r#   r   r   s     r   r%   z&GridFunc.call_const.<locals>.<genexpr>U  s6      CCdj//!/44CCCCCCr   zSTD::make_pair(r{   zSTD::make_tuple()r"   r.   r+   r
   r   r   r   uint32r~   joinTuple)r   r:   rK   dims	elts_coder4   s   `     r   rn   zGridFunc.call_constF  s   $$$ 	75666 199
))C)00+2DEEEQYYDDQYY"DD<===IICCCCdCCCCC	!;#5"6t";<<1996)666>>>79777???r   )r1   rA   rB   r   r   rn   rC   rD   s   @r   r   r     s^        ! ! !F    @ @ @ @ @ @ @r   r   c                   6     e Zd Zd Zdd fd
ZdddZ xZS )WarpShuffleOpc                 n    || _         d|r|dz   ndz   dz   | _        || _        d| j         d}|| _        d S )N__shfl__ syncr   z`` function. Please refer to
        `Warp Shuffle Functions`_ for detailed explanation.

        .. _Warp Shuffle Functions:
            https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#warp-shuffle-functions
        r   r   s       r   r   zWarpShuffleOp.__init__`  sU    b"8"s((b9FB
tz    r       )widthc                H    t                                                       d S r!   r   )r   rs   r   val_idr   r   s        r   r   zWarpShuffleOp.__call__l  r   r   Nc                   | j         }t          j        ||          }|j        }|j        j        | j        vrt          d| d|j         d          	 |j        }n# t          $ r t          d          w xY wt          j        rt          j        d| dt                     nd|cxk    rdk    sn t          d	          | j        d
v rt"          j        }nt"          j        }t)          j        ||d|          }t          j        ||          }|r.t-          |t.                    r|j        dvrt          d          n*t          j        rt/          d          nt/          d          }t)          j        |t"          j        d|          }t          j        ||          }| dt1          |           d|j         d|j         }	|	d|j         dz  }	t          |	|          S )Nr   r   r   zmask must be an integerrw   rx   r   rr   ry   )updownrz   )r            r   zwidth needs to be power of 2@   r   (r   r{   )r   r
   r7   r4   r5   r   r   r+   r-   	Exceptionr   r|   r2   r3   r}   r~   r   r   r   r   r   r   r"   r   hexrL   )
r   r:   rs   r   r   r   r   r4   val_id_trL   s
             r   r?   zWarpShuffleOp.callo  s
   ziS!!	;4<//MMMMMMNNN	78DD 	7 	7 	75666	7> 	5M:$:::NKKKK++++++++3444 8~%%")HH"(H(;LL63'' 	E%** E9$555$%CDDD$+NDHRLLLE';$k38 8	%%%??T??ch??&+??"UZ""""D%   s   A A5r   rD   s   @r   r   r   ^  st        
 
 
 46        59 &! &! &! &! &! &! &! &! &!r   r   c                   *     e Zd Z fdZd Zd Z xZS )LaneIDc                 H    t                                                       dS )zReturns the lane ID of the calling thread, ranging in
        ``[0, jit.warpsize)``.

        .. note::
            Unlike :obj:`numba.cuda.laneid`, this is a callable function
            instead of a property.
        Nr   ri   s    r   r   zLaneID.__call__  rj   r   c                 8    d}t           j        s|dz  }n|dz  }|S )Nz2__device__ __forceinline__ unsigned int LaneId() {z
                unsigned int ret;
                asm ("mov.u32 %0, %%laneid;" : "=r"(ret) );
                return ret; }
            z3
                return __lane_id(); }
            )r   r|   )r   preambles     r   _get_preamblezLaneID._get_preamble  s:    G~ 	  HH   H r   c                     |j                             |                                            t          dt          j                  S )NzLaneId())	generatedadd_coder   r
   r   r   rm   s     r   rn   zLaneID.call_const  s7    t1133444J 2333r   )r1   rA   rB   r   r   rn   rC   rD   s   @r   r   r     sV               4 4 4 4 4 4 4r   r   builtin_functions_dictr   r   Add)r   r   uint64float32float64rP   )float16Sub)r   r   Exch)r   r   r   r   Min)r   r   r   MaxInc)r   Decr   )uint16AndOrXor)r   r   int64r   r   )r   r   r   r   r   xor)=typingr   r   r2   rV   cupy_backends.cuda.apir   	cupy.cudar   	cupyx.jitr   r   cupyx.jit._internal_typesr	   r
   r   r   r   	functoolsr   r   rF   rR   rb   rg   rq   r   r   r   r   r   ranger*   minmaxr   __annotations__range_syncthreadssyncwarpshared_memoryr   r   laneidr|   
atomic_add
atomic_subatomic_exch
atomic_min
atomic_max
atomic_inc
atomic_dec
atomic_cas
atomic_and	atomic_or
atomic_xor_shfl_dtypes	shfl_syncshfl_up_syncshfl_down_syncshfl_xor_syncrP   r   r   <module>r     s	              * * * * * *       ! ! ! ! ! ! % % % % % % 1 1 1 1 1 1 * * * * * * . . . . . . + + + + + +            WP WP WP WP WP WP WP WPt- - - - -k - - - 	4 	4 	4 	4 	4k 	4 	4 	4	4 	4 	4 	4 	4k 	4 	4 	49 9 9 9 9+ 9 9 9 , , , , ,{ , , ,D2 2 2 2 2; 2 2 20<! <! <! <! <!{ <! <! <!~=@ =@ =@ =@ =@{ =@ =@ =@@7! 7! 7! 7! 7!K 7! 7! 7!t4 4 4 4 4[ 4 4 4B 
99;;	5 [ 01    
kmm8::x8J	 X	7^-rr/0 0
 X	   
h
46 6X	(* *
X	(* *
X	; 
X	; 
X	!^,rr./ /
 X	(* *
H
') )	X	(* *

 7^6rr!68  M"l++	}T<00v|44e\22r   