
    `i6                        d dl mZ d dlmZ d dlmZ d dlmZ d dlm	Z
 d dlmZ d dlmZ g dZd	d
dZd Z G d dej                  Z G d de          Z G d de          Z G d de          Z G d de          Z G d de          Z G d de          Z G d de          Z ed          Z ed          Z e            Z e            Z e            Z e            Z dS )     )runtime)_compile)_cuda_types)BuiltinFunc)Constant)Data)wraps_class_method)	this_gridthis_thread_blocksyncwait
wait_priormemcpy_asyncz[#include <cuda/barrier>
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
z,#include <cooperative_groups/memcpy_async.h>)cgcg_memcpy_asyncc                     t          | j        d|           }|du rE| j        j                            t          |                    t          | j        d| d           d S d S )Ninclude_FT)getattr	generatedcodesappend_header_to_codesetattr)envheaderflags      `/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/cupyx/jit/cg.py_check_includer      sm    3="5V"5"566Du}}""?6#:;;;2&22D99999 }    c                   (    e Zd ZdZdZd Zd Zd ZdS )_ThreadGroupz( Base class for all cooperative groups. Nc                     t           N)NotImplementedErrorselfs    r   __init__z_ThreadGroup.__init__&   s    !!r   c                     | j          S r#   
child_typer%   s    r   __str__z_ThreadGroup.__str__)   s    /##r   c                 f    t          |d           t          |j         dt          j                  S )Nr   z.sync()r   _Datacoder   voidr&   r   instances      r   _syncz_ThreadGroup._sync,   s0    sD!!!...0@AAAr   )__name__
__module____qualname____doc__r*   r'   r+   r3    r   r   r!   r!   !   sQ        22J" " "$ $ $B B B B Br   r!   c                        e Zd ZdZd Zed             Ze fd            Zed             Zed             Z	ed             Z
ed             Zed	             Zed
             Zed             Zed             Z xZS )
_GridGroupa  A handle to the current grid group. Must be created via :func:`this_grid`.

    .. seealso:: `CUDA Grid Group API`_, :class:`numba.cuda.cg.GridGroup`

    .. _CUDA Grid Group API:
        https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#grid-group-cg
    c                     d| _         d S )Nzcg::grid_groupr)   r%   s    r   r'   z_GridGroup.__init__:   s    *r   c                 f    t          |d           t          |j         dt          j                  S )zU
        is_valid()

        Returns whether the grid_group can synchronize.
        r   z.is_valid())r   r.   r/   r   bool_r1   s      r   is_validz_GridGroup.is_valid=   s2     	sD!!!222K4EFFFr   c                 `    d|j         _        t                                          ||          S )z
        sync()

        Synchronize the threads named in the group.

        .. seealso:: :meth:`numba.cuda.cg.GridGroup.sync`
        T)r   	enable_cgsuperr3   r&   r   r2   	__class__s      r   r   z_GridGroup.syncG   s'     #'ww}}S(+++r   c                 f    t          |d           t          |j         dt          j                  S z`
        thread_rank()

        Rank of the calling thread within ``[0, num_threads)``.
        r   z.thread_rank()r   r.   r/   r   uint64r1   s      r   thread_rankz_GridGroup.thread_rankU   2     	sD!!!555{7IJJJr   c                     t          j                    dk     rt          d          t          |d           t	          |j         dt          j                  S )z]
        block_rank()

        Rank of the calling block within ``[0, num_blocks)``.
        4+  z'block_rank() is supported on CUDA 11.6+r   z.block_rank()_runtime_getLocalRuntimeVersionRuntimeErrorr   r.   r/   r   rG   r1   s      r   
block_rankz_GridGroup.block_rank_   U     +--55HIIIsD!!!444k6HIIIr   c                     t          j                    dk     rt          d          t          |d           t	          |j         dt          j                  S zN
        num_threads()

        Total number of threads in the group.
        rK   z(num_threads() is supported on CUDA 11.6+r   z.num_threads()rL   r1   s      r   num_threadsz_GridGroup.num_threadsk   U     +--55IJJJsD!!!555{7IJJJr   c                     t          j                    dk     rt          d          t          |d           t	          |j         dt          j                  S )zL
        num_blocks()

        Total number of blocks in the group.
        rK   z'num_blocks() is supported on CUDA 11.6+r   z.num_blocks()rL   r1   s      r   
num_blocksz_GridGroup.num_blocksw   rQ   r   c                     t          j                    dk     rt          d          t          |d           t	          |j         dt          j                  S )z[
        dim_blocks()

        Dimensions of the launched grid in units of blocks.
        rK   z'dim_blocks() is supported on CUDA 11.6+r   z.dim_blocks()rM   rN   rO   r   r.   r/   r   dim3r1   s      r   
dim_blocksz_GridGroup.dim_blocks   sU     +--55HIIIsD!!!444k6FGGGr   c                     t          j                    dk     rt          d          t          |d           t	          |j         dt          j                  S )zc
        block_index()

        3-Dimensional index of the block within the launched grid.
        rK   z(block_index() is supported on CUDA 11.6+r   z.block_index()rY   r1   s      r   block_indexz_GridGroup.block_index   U     +--55IJJJsD!!!555{7GHHHr   c                 f    t          |d           t          |j         dt          j                  S zG
        size()

        Total number of threads in the group.
        r   z.size()rF   r1   s      r   sizez_GridGroup.size   2     	sD!!!...0BCCCr   c                 f    t          |d           t          |j         dt          j                  S )zZ
        group_dim()

        Dimensions of the launched grid in units of blocks.
        r   .group_dim()r   r.   r/   r   rZ   r1   s      r   	group_dimz_GridGroup.group_dim   2     	sD!!!333[5EFFFr   )r4   r5   r6   r7   r'   _wraps_class_methodr>   r   rH   rP   rT   rW   r[   r]   ra   rf   __classcell__rC   s   @r   r:   r:   1   sy        + + + G G G , , , , , K K K 	J 	J 	J 	K 	K 	K 	J 	J 	J 	H 	H 	H 	I 	I 	I D D D G G G G G G Gr   r:   c                        e Zd ZdZd Ze fd            Zed             Zed             Zed             Z	ed             Z
ed             Zed	             Zed
             Z xZS )_ThreadBlockGroupa  A handle to the current thread block group. Must be
    created via :func:`this_thread_block`.

    .. seealso:: `CUDA Thread Block Group API`_

    .. _CUDA Thread Block Group API:
        https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#thread-block-group-cg
    c                     d| _         d S )Nzcg::thread_blockr)   r%   s    r   r'   z_ThreadBlockGroup.__init__   s    ,r   c                 H    t                                          ||          S )zM
        sync()

        Synchronize the threads named in the group.
        )rA   r3   rB   s      r   r   z_ThreadBlockGroup.sync   s     ww}}S(+++r   c                 f    t          |d           t          |j         dt          j                  S rE   r   r.   r/   r   uint32r1   s      r   rH   z_ThreadBlockGroup.thread_rank   rI   r   c                 f    t          |d           t          |j         dt          j                  S )zc
        group_index()

        3-Dimensional index of the block within the launched grid.
        r   z.group_index()re   r1   s      r   group_indexz_ThreadBlockGroup.group_index   s2     	sD!!!555{7GHHHr   c                 f    t          |d           t          |j         dt          j                  S )zf
        thread_index()

        3-Dimensional index of the thread within the launched block.
        r   z.thread_index()re   r1   s      r   thread_indexz_ThreadBlockGroup.thread_index   s2     	sD!!!6668HIIIr   c                     t          j                    dk     rt          d          t          |d           t	          |j         dt          j                  S )z^
        dim_threads()

        Dimensions of the launched block in units of threads.
        rK   z(dim_threads() is supported on CUDA 11.6+r   z.dim_threads()rY   r1   s      r   dim_threadsz_ThreadBlockGroup.dim_threads   r^   r   c                     t          j                    dk     rt          d          t          |d           t	          |j         dt          j                  S rS   )rM   rN   rO   r   r.   r/   r   rq   r1   s      r   rT   z_ThreadBlockGroup.num_threads   rU   r   c                 f    t          |d           t          |j         dt          j                  S r`   rp   r1   s      r   ra   z_ThreadBlockGroup.size   rb   r   c                 f    t          |d           t          |j         dt          j                  S )z\
        group_dim()

        Dimensions of the launched block in units of threads.
        r   rd   re   r1   s      r   rf   z_ThreadBlockGroup.group_dim	  rg   r   )r4   r5   r6   r7   r'   rh   r   rH   rs   ru   rw   rT   ra   rf   ri   rj   s   @r   rl   rl      s;        - - - , , , , , K K K I I I J J J 	I 	I 	I 	K 	K 	K D D D G G G G G G Gr   rl   c                   *     e Zd Zd Z fdZd Z xZS )_ThisCgGroupc                     |dk    rd}d}n|dk    rd}d}nt           || _        d| d| d	| d
| _        |dk    r| xj        dz  c_        d S d S )Ngridz
grid groupr:   thread_blockzthread block grouprl   z
        Returns the current z (:class:`~cupyx.jit.cg.z/`).

        .. seealso:: :class:`cupyx.jit.cg.`z!, :func:`numba.cuda.cg.this_grid`)r$   
group_typer7   )r&   r   nametypenames       r   r'   z_ThisCgGroup.__init__  s    D#HH>))'D*HH%%$9!9 9;C9 9 ,49 9 9 LL??LLLL  r   c                 H    t                                                       d S r#   rA   __call__)r&   rC   s    r   r   z_ThisCgGroup.__call__(  s    r   c                     t           j        rt          d          | j        dk    rt	                      }n| j        dk    rt                      }t          d| j         d|          S )Nz)cooperative group is not supported on HIPr~   r   z	cg::this_z())rM   is_hiprO   r   r:   rl   r.   )r&   r   cg_types      r   
call_constz_ThisCgGroup.call_const+  sj    ? 	LJKKK?f$$ llGG_..'))G4444g>>>r   )r4   r5   r6   r'   r   r   ri   rj   s   @r   r|   r|     sY        @ @ @"    ? ? ? ? ? ? ?r   r|   c                   $     e Zd Z fdZd Z xZS )_Syncc                 H    t                                                       dS )a  Calls ``cg::sync()``.

        Args:
            group: a valid cooperative group

        .. seealso:: `cg::sync`_

        .. _cg::sync:
            https://docs.nvidia.com/cuda/archive/11.6.0/cuda-c-programming-guide/index.html#collectives-cg-sync
        Nr   r&   grouprC   s     r   r   z_Sync.__call__7  !     	r   c                     t          |j        t                    st          d          t	          |d           t          d|j         dt          j                  S )Nz'group must be a valid cooperative groupr   z	cg::sync())	
isinstancectyper!   
ValueErrorr   r.   r/   r   r0   r&   r   r   s      r   callz
_Sync.callD  sW    %+|44 	HFGGGsD!!!....0@AAAr   r4   r5   r6   r   r   ri   rj   s   @r   r   r   5  sN            B B B B B B Br   r   c                   0     e Zd Zdd fd
ZdddZ xZS )_MemcpySyncN)aligned_sizec                H    t                                                       dS )a  Calls ``cg::memcpy_sync()``.

        Args:
            group: a valid cooperative group
            dst: the destination array that can be viewed as a 1D
                C-contiguous array
            dst_idx: the start index of the destination array element
            src: the source array that can be viewed as a 1D C-contiguous
                array
            src_idx: the start index of the source array element
            size (int): the number of bytes to be copied from
                ``src[src_index]`` to ``dst[dst_idx]``
            aligned_size (int): Use ``cuda::aligned_size_t<N>`` to guarantee
                the compiler that ``src``/``dst`` are at least N-bytes aligned.
                The behavior is undefined if the guarantee is not held.

        .. seealso:: `cg::memcpy_sync`_

        .. _cg::memcpy_sync:
            https://docs.nvidia.com/cuda/archive/11.6.0/cuda-c-programming-guide/index.html#collectives-cg-memcpy-async
        Nr   )	r&   r   dstdst_idxsrcsrc_idxra   r   rC   s	           r   r   z_MemcpySync.__call__M  s!    . 	r   c                   t          |d           t          |d           t          j        ||          }t          j        ||          }||fD ]<}	t          |	j        t
          j        t
          j        f          st          d          =t          j
        |||          }t          j
        |||          }t          j        |t
          j        d|          }t          j        ||          }|j         }
|r2t          |t                    st          d          d|j         d|
 d}
t          d	|j         d
|j         d|j         d|
 d	t
          j                  S )Nr   r   zdst/src must be of array type.	same_kindz,aligned_size must be a compile-time constantzcuda::aligned_size_t<>(r   zcg::memcpy_async(z, &(z), &(z), )r   r.   initr   r   r   CArrayPtr	TypeErrorr   	_indexing_astype_scalarrq   r/   	_Constantr   objr0   )r&   r   r   r   r   r   r   ra   r   arr	size_codes              r   r   z_MemcpySync.callf  s   sD!!!s-...jc""jc"": 	B 	BCI 2KODF F B @AAAB  gs33 gs33& +$k38 8 z$$$yN	 	+lI66 D BD D D*1A * *&* * *I 4 4 4 4 4(4 4'04 4 45@5EG G 	Gr   r   rj   s   @r   r   r   K  sn         #      4 G G G G G G G G Gr   r   c                   $     e Zd Z fdZd Z xZS )_Waitc                 H    t                                                       dS )a  Calls ``cg::wait()``.

        Args:
            group: a valid cooperative group

        .. seealso: `cg::wait`_

        .. _cg::wait:
            https://docs.nvidia.com/cuda/archive/11.6.0/cuda-c-programming-guide/index.html#collectives-cg-wait
        Nr   r   s     r   r   z_Wait.__call__  r   r   c                 h    t          |d           t          d|j         dt          j                  S )Nr   z	cg::wait(r   r-   r   s      r   r   z
_Wait.call  s3    sD!!!....0@AAAr   r   rj   s   @r   r   r     sN            B B B B B B Br   r   c                   $     e Zd Z fdZd Z xZS )
_WaitPriorc                 H    t                                                       dS )aX  Calls ``cg::wait_prior<N>()``.

        Args:
            group: a valid cooperative group
            step (int): wait for the first ``N`` steps to finish

        .. seealso: `cg::wait_prior`_

        .. _cg::wait_prior:
            https://docs.nvidia.com/cuda/archive/11.6.0/cuda-c-programming-guide/index.html#collectives-cg-wait
        Nr   r   s     r   r   z_WaitPrior.__call__  s!     	r   c                     t          |d           t          |t                    st          d          t	          d|j         d|j         dt          j                  S )Nr   z$step must be a compile-time constantzcg::wait_prior<r   r   )	r   r   r   r   r.   r   r/   r   r0   )r&   r   r   steps       r   r   z_WaitPrior.call  sf    sD!!!$	** 	ECDDD@tx@@5:@@@ %' ' 	'r   r   rj   s   @r   r   r     sG            ' ' ' ' ' ' 'r   r   r~   r   N)!	cupy.cudar   rM   	cupyx.jitr   r   cupyx.jit._internal_typesr   _BuiltinFuncr   r   r   r.   r	   rh   __all__r   r   TypeBaser!   r:   rl   r|   r   r   r   r   r
   r   r   r   r   r   r8   r   r   <module>r      s   ) ) ) ) ) )       ! ! ! ! ! ! A A A A A A ; ; ; ; ; ; 3 3 3 3 3 3 O O O O O O9 9 92 F	 : : :B B B B B;' B B B ~G ~G ~G ~G ~G ~G ~G ~GB`G `G `G `G `G `G `G `GF? ? ? ? ?< ? ? ?@B B B B BL B B B,7G 7G 7G 7G 7G, 7G 7G 7GtB B B B BL B B B(' ' ' ' ' ' ' '0 L  	 L00 uwwuwwZ\\
{}}r   