
    `iU                        d dl mZ d dlZd dlmZ d dlmZ d dlmZ d dl	m
Z
 d Zej        ej        ej        gZej        ej        ej        ej        gZej        ej        gZej        ej        ej        ej        gZeez   ez   ez   Zd  eee          D             Zd	 eD             Zd
 eD             Z dZ!dZ" ej#        e!dd e D             d eD             z   d eD             z   d eD             z             Z$ ej#        e"dd e D             d eD             z   d eD             z   d eD             z   d eD             z   d eD             z             Z%d Z&d Z'd Z(d Z)d#dZ*d  Z+	 	 d$d"Z,dS )%    )productN)_normalize_axis_index)get_typename)runtime)
axis_slicec                 R    t          |           }|dk    rt          j        rd}nd}|S )Nfloat16__halfhalf)r   r   is_hip)dtypetypenames     q/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/cupyx/scipy/signal/_iir_utils.py_get_typenamer      s7    E""H9> 	  HHHO    c                 n    g | ]2\  }}t          j        ||          t          j        |          u .||f3S  )cupypromote_typesr   .0xys      r   
<listcomp>r      sI     < < <A#Aq))TZ]]:: !f:::r   c                 ,    g | ]}t          |          S r   r   )r   ts     r   r   r       s     ...1mA...r   c                 P    g | ]#\  }}t          |          t          |          f$S r   r   r   s      r   r   r   !   s1    OOODAqM!$$mA&6&67OOOr   a  
#include <cupy/math_constants.h>
#include <cupy/carray.cuh>
#include <cupy/complex.cuh>

template<typename U, typename T>
__global__ void compute_correction_factors(
        const int m, const int k, const T* b, U* out) {
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    if(idx >= k) {
        return;
    }

    U* out_start = out + idx * (k + m);
    U* out_off = out_start + k;

    for(int i = 0; i < m; i++) {
        U acc = 0.0;
        for(int j = 0; j < k; j++) {
            acc += ((U) b[j]) * out_off[i - j - 1];

        }
        out_off[i] = acc;
    }
}

template<typename T>
__global__ void first_pass_iir(
        const int m, const int k, const int n, const int n_blocks,
        const int carries_stride, const T* factors, T* out,
        T* carries) {
    int orig_idx = blockDim.x * (blockIdx.x % n_blocks) + threadIdx.x;

    int num_row = blockIdx.x / n_blocks;
    int idx = 2 * orig_idx + 1;

    if(idx >= n) {
        return;
    }

    int group_num = idx / m;
    int group_pos = idx % m;

    T* out_off = out + num_row * n;
    T* carries_off = carries + num_row * carries_stride;

    T* group_start = out_off + m * group_num;
    T* group_carries = carries_off + k * group_num;

    int pos = group_pos;
    int up_bound = pos;
    int low_bound = pos;
    int rel_pos;

    for(int level = 1, iter = 1; level < m; level *=2, iter++) {
        int sz = min(pow(2.0f, ((float) iter)), ((float) m));

        if(level > 1) {
            int factor = ceil(pos / ((float) sz));
            up_bound = sz * factor - 1;
            low_bound = up_bound - level + 1;
        }

        if(level == 1) {
            pos = low_bound;
        }

        if(pos < low_bound) {
            pos += level / 2;
        }

        if(pos + m * group_num >= n) {
            break;
        }

        rel_pos = pos % level;
        T carry = 0.0;
        for(int i = 1; i <= min(k, level); i++) {
            T k_value = group_start[low_bound - i];
            const T* k_factors = factors + (m + k) * (i - 1) + k;
            T factor = k_factors[rel_pos];
            carry += k_value * factor;
        }

        group_start[pos] += carry;
        __syncthreads();
    }

    if(pos >= m - k) {
        if(carries != NULL) {
            group_carries[pos - (m - k)] = group_start[pos];
        }
    }

}

template<typename T>
__global__ void correct_carries(
    const int m, const int k, const int n_blocks, const int carries_stride,
    const int offset, const T* factors, T* carries) {

    int idx = threadIdx.x;
    int pos = idx + (m - k);
    T* row_carries = carries + carries_stride * blockIdx.x;

    for(int i = offset; i < n_blocks; i++) {
        T* this_carries = row_carries + k * (i + (1 - offset));
        T* prev_carries = row_carries + k * (i - offset);

        T carry = 0.0;
        for(int j = 1; j <= k; j++) {
            const T* k_factors = factors + (m + k) * (j - 1) + k;
            T factor = k_factors[pos];
            T k_value = prev_carries[k - j];
            carry += factor * k_value;
        }

        this_carries[idx] += carry;
        __syncthreads();
    }
}

template<typename T>
__global__ void second_pass_iir(
        const int m, const int k, const int n, const int carries_stride,
        const int n_blocks, const int offset, const T* factors,
        T* carries, T* out) {

    int idx = blockDim.x * (blockIdx.x % n_blocks) + threadIdx.x;
    idx += offset * m;

    int row_num = blockIdx.x / n_blocks;
    int n_group = idx / m;
    int pos = idx % m;

    if(idx >= n) {
        return;
    }

    T* out_off = out + row_num * n;
    T* carries_off = carries + row_num * carries_stride;
    const T* prev_carries = carries_off + (n_group - offset) * k;

    T carry = 0.0;
    for(int i = 1; i <= k; i++) {
        const T* k_factors = factors + (m + k) * (i - 1) + k;
        T factor = k_factors[pos];
        T k_value = prev_carries[k - i];
        carry += factor * k_value;
    }

    out_off[idx] += carry;
}
a  
#include <cupy/math_constants.h>
#include <cupy/carray.cuh>
#include <cupy/complex.cuh>

template<typename T>
__global__ void pick_carries(
        const int m, const int n, const int carries_stride, const int n_blocks,
        const int offset, T* x, T* carries) {

    int idx = m * (blockIdx.x % n_blocks) + threadIdx.x + m - 2;
    int pos = threadIdx.x;
    int row_num = blockIdx.x / n_blocks;
    int n_group = idx / m;

    T* x_off = x + row_num * n;
    T* carries_off = carries + row_num * carries_stride;
    T* group_carries = carries_off + (n_group + (1 - offset)) * 2;

    if(idx >= n) {
        return;
    }

    group_carries[pos] = x_off[idx];
}

template<typename U, typename T>
__global__ void compute_correction_factors_sos(
        const int m, const T* f_const, U* all_out) {

    extern __shared__ __align__(sizeof(T)) thrust::complex<double> bc_d[2];
    T* b_c = reinterpret_cast<T*>(bc_d);

    extern __shared__ __align__(sizeof(T)) thrust::complex<double> off_d[4];
    U* off_cache = reinterpret_cast<U*>(off_d);

    int idx = threadIdx.x;
    int num_section = blockIdx.x;

    const int n_const = 6;
    const int a_off = 3;
    const int k = 2;
    const int off_idx = 1;

    U* out = all_out + num_section * k * m;
    U* out_start = out + idx * m;
    const T* b = f_const + num_section * n_const + a_off + 1;

    b_c[idx] = b[idx];
    __syncthreads();

    U* this_cache = off_cache + k * idx;
    this_cache[off_idx - idx] = 1;
    this_cache[idx] = 0;

    for(int i = 0; i < m; i++) {
        U acc = 0.0;
        for(int j = 0; j < k; j++) {
            acc += -((U) b_c[j]) * this_cache[off_idx - j];

        }
        this_cache[0] = this_cache[1];
        this_cache[1] = acc;
        out_start[i] = acc;
    }
}


template<typename T>
__global__ void first_pass_iir_sos(
        const int m, const int n, const int n_blocks,
        const T* factors, T* out, T* carries) {

    extern __shared__ unsigned int thread_status[2];
    extern __shared__ __align__(sizeof(T)) thrust::complex<double> fc_d[2 * 1024];
    T* factor_cache = reinterpret_cast<T*>(fc_d);

    int orig_idx = blockDim.x * (blockIdx.x % n_blocks) + threadIdx.x;

    int num_row = blockIdx.x / n_blocks;
    int idx = 2 * orig_idx + 1;
    const int k = 2;

    if(idx >= n) {
        return;
    }

    int group_num = idx / m;
    int group_pos = idx % m;
    T* out_off = out + num_row * n;
    T* carries_off = carries + num_row * n_blocks * k;

    T* group_start = out_off + m * group_num;
    T* group_carries = carries_off + group_num * k;

    const T* section_factors = factors;
    T* section_carries = group_carries;

    factor_cache[group_pos] = section_factors[group_pos];
    factor_cache[group_pos - 1] = section_factors[group_pos - 1];
    factor_cache[m + group_pos] = section_factors[m + group_pos];
    factor_cache[m + group_pos - 1] = section_factors[m + group_pos - 1];
    __syncthreads();

    int pos = group_pos;
    int up_bound = pos;
    int low_bound = pos;
    int rel_pos;

    for(int level = 1, iter = 1; level < m; level *= 2, iter++) {
        int sz = min(pow(2.0f, ((float) iter)), ((float) m));

        if(level > 1) {
            int factor = ceil(pos / ((float) sz));
            up_bound = sz * factor - 1;
            low_bound = up_bound - level + 1;
        }

        if(level == 1) {
            pos = low_bound;
        }

        if(pos < low_bound) {
            pos += level / 2;
        }

        if(pos + m * group_num >= n) {
            break;
        }

        rel_pos = pos % level;
        T carry = 0.0;
        for(int i = 1; i <= min(k, level); i++) {
            T k_value = group_start[low_bound - i];
            const T* k_factors = factor_cache + m  * (i - 1);
            T factor = k_factors[rel_pos];
            carry += k_value * factor;
        }

        group_start[pos] += carry;
        __syncthreads();
    }

    if(pos >= m - k) {
        if(carries != NULL) {
            section_carries[pos - (m - k)] = group_start[pos];
        }
    }
}

template<typename T>
__global__ void correct_carries_sos(
    const int m, const int n_blocks, const int carries_stride,
    const int offset, const T* factors, T* carries) {

    extern __shared__ __align__(sizeof(T)) thrust::complex<double> fcd3[4];
    T* factor_cache = reinterpret_cast<T*>(fcd3);

    int idx = threadIdx.x;
    const int k = 2;
    int pos = idx + (m - k);
    T* row_carries = carries + carries_stride * blockIdx.x;

    factor_cache[2 * idx] = factors[pos];
    factor_cache[2 * idx + 1] = factors[m + pos];
    __syncthreads();

    for(int i = offset; i < n_blocks; i++) {
        T* this_carries = row_carries + k * (i + (1 - offset));
        T* prev_carries = row_carries + k * (i - offset);

        T carry = 0.0;
        for(int j = 1; j <= k; j++) {
            // const T* k_factors = factors + m * (j - 1);
            // T factor = k_factors[pos];
            T factor = factor_cache[2 * idx + (j - 1)];
            T k_value = prev_carries[k - j];
            carry += factor * k_value;
        }

        this_carries[idx] += carry;
        __syncthreads();
    }
}

template<typename T>
__global__ void second_pass_iir_sos(
        const int m, const int n, const int carries_stride,
        const int n_blocks, const int offset, const T* factors,
        T* carries, T* out) {

    extern __shared__ __align__(sizeof(T)) thrust::complex<double> fcd2[2 * 1024];
    T* factor_cache = reinterpret_cast<T*>(fcd2);

    extern __shared__ __align__(sizeof(T)) thrust::complex<double> c_d[2];
    T* carries_cache = reinterpret_cast<T*>(c_d);

    int idx = blockDim.x * (blockIdx.x % n_blocks) + threadIdx.x;
    idx += offset * m;

    int row_num = blockIdx.x / n_blocks;
    int n_group = idx / m;
    int pos = idx % m;
    const int k = 2;

    T* out_off = out + row_num * n;
    T* carries_off = carries + row_num * carries_stride;
    const T* prev_carries = carries_off + (n_group - offset) * k;

    if(pos < k) {
        carries_cache[pos] = prev_carries[pos];
    }

    if(idx >= n) {
        return;
    }

    factor_cache[pos] = factors[pos];
    factor_cache[pos + m] = factors[pos + m];
    __syncthreads();

    T carry = 0.0;
    for(int i = 1; i <= k; i++) {
        const T* k_factors = factor_cache + m * (i - 1);
        T factor = k_factors[pos];
        T k_value = carries_cache[k - i];
        carry += factor * k_value;
    }

    out_off[idx] += carry;
}

template<typename T>
__global__ void fir_sos(
        const int m, const int n, const int carries_stride, const int n_blocks,
        const int offset, const T* sos, T* carries, T* out) {

    extern __shared__ __align__(sizeof(T)) thrust::complex<double> fir_cc[1024 + 2];
    T* fir_cache = reinterpret_cast<T*>(fir_cc);

    extern __shared__ __align__(sizeof(T)) thrust::complex<double> fir_b[3];
    T* b = reinterpret_cast<T*>(fir_b);

    int idx = blockDim.x * (blockIdx.x % n_blocks) + threadIdx.x;
    int row_num = blockIdx.x / n_blocks;
    int n_group = idx / m;
    int pos = idx % m;
    const int k = 2;

    T* out_row = out + row_num * n;
    T* out_off = out_row + n_group * m;
    T* carries_off = carries + row_num * carries_stride;
    T* this_carries = carries_off + k * (n_group + (1 - offset));
    T* group_carries = carries_off + (n_group - offset) * k;

    if(pos <= k) {
        b[pos] = sos[pos];
    }

    if(pos < k) {
        if(offset && n_group == 0) {
            fir_cache[pos] = 0;
        } else {
            fir_cache[pos] = group_carries[pos];
        }
    }

    if(idx >= n) {
        return;
    }

    fir_cache[pos + k] = out_off[pos];
    __syncthreads();

    T acc = 0.0;
    for(int i = k; i >= 0; i--) {
        acc += fir_cache[pos + i] * b[k - i];
    }

    out_off[pos] = acc;
}
)z
-std=c++11c                 &    g | ]\  }}d | d| dS )zcompute_correction_factors<, >r   r   s      r   r   r     s?     3 3 3!Q >A===== 3 3 3r   c                     g | ]}d | d	S )zcorrect_carries<r!   r   r   r   s     r   r   r     $    BBB!----BBBr   c                     g | ]}d | d	S )zfirst_pass_iir<r!   r   r#   s     r   r   r     s$    AAA,,,,AAAr   c                     g | ]}d | d	S )zsecond_pass_iir<r!   r   r#   s     r   r   r     r$   r   )codeoptionsname_expressionsc                 &    g | ]\  }}d | d| dS )zcompute_correction_factors_sos<r    r!   r   r   s      r   r   r     s@     3 3 3!Q BAAQAAA 3 3 3r   c                     g | ]}d | d	S )zpick_carries<r!   r   r#   s     r   r   r     s$    ...aQ...r   c                     g | ]}d | d	S )zcorrect_carries_sos<r!   r   r#   s     r   r   r     $    555Q A   555r   c                     g | ]}d | d	S )zfirst_pass_iir_sos<r!   r   r#   s     r   r   r     s$    444A1444r   c                     g | ]}d | d	S )zsecond_pass_iir_sos<r!   r   r#   s     r   r   r     r-   r   c                     g | ]}d | d	S )zfir_sos<r!   r   r#   s     r   r   r     s     )))____)))r   c                     d |D             }d                     |          }|r| d| dn|}|                     |          }|S )Nc                 6    g | ]}t          |j                  S r   )r   r   )r   args     r   r   z$_get_module_func.<locals>.<listcomp>  s"    EEE=++EEEr   r    <r!   )joinget_function)module	func_nametemplate_argsargs_dtypestemplatekernel_namekernels          r   _get_module_funcr>     s^    EE}EEEKyy%%H0=LY,,,,,,9K  --FMr   c                     t          j        | |d          } | j        }|                     d| j        d                   } | j        j        s|                                 } | |fS )Nr   moveaxisshapereshapeflagsc_contiguouscopyr   axisx_shapes      r   collapse_2drK     sY    ar""AgG			"agbk""A7 FFHHg:r   c                     t          j        | |dz   d          } | j        }|                     | j        d         d| j        d                   } | j        j        s|                                 } | |fS )N   r@   r   rA   rH   s      r   collapse_2d_restrN     se    a2&&AgG			!'!*b!'"+..A7 FFHHg:r   c                     | j         }t          j        ||          }t          j        |d d d         t          j        ||f|          f         }t          t          d||           } ||fd||| |f           |S )Nr   r@   compute_correction_factorsrM   )sizer   eyec_emptyr>   
IIR_MODULE)ablock_szr   k
correctioncorr_kernels         r   rQ   rQ     s    	A!5)))J44R4$*a]%@@@@BJ"0*aA AKKdXq!Z8999r   r@      c                 >   |t          j        | j        |j                  }|                    |          }||                    |          }| j        }| j        }t          ||          }|j        }||         }	|dk    r(t          | |          \  } }|t          ||          \  }}
t          j	        | |d          }| j        dk    rdn| j        d         }|	|z   dz
  |z  }||z  }t          j
        ||          }t           j        |d d d         t          j        ||f|          f         }t          j        |||f|          }t          t          d||          }t          t          d|          }t          t          d	|          }t          t          d
|          } ||fd||||f            ||f|dz  f|||	|||z  |||f           ||j        dk    rt          j        ||d|j        f          }n-|j        dk    r"|                    |d|j        d                   }|j        dk    r|}nt          j        ||fd          }|j        j        s|                                }|dk    s|Rt+          |d u           }||z
  }|d|z
  z   |z  } ||f|f|||||||f            |||z  f|f|||	||||||f	           |dk    rK|                    |          }t          j        |d|          }|j        j        s|                                }|S )NrM   Tr   rG   r   rP   r@   rQ   first_pass_iirsecond_pass_iircorrect_carriesrR      )rI   )r   result_typer   astyperC   ndimr   rS   rK   arrayrT   rU   rV   r>   rW   broadcast_torD   concatenaterE   rF   rG   intrB   )r   rX   rI   zir   rY   rJ   x_ndimrZ   n_outnum_rowsn_blockstotal_blocksr[   carriesr\   first_pass_kernelsecond_pass_kernelcarry_correction_kernelstarting_groupblocks_to_mergecarries_strides                           r   	apply_iirrz     s    } !'22	A	~YYugGVF v..D	AAzz D))
7>D))EB
*Qe$
/
/
/CFaKKqqQWQZHHq X-Hh&L!5)))J44R4$*a]%@@@@BJj	8Qu. . .G #0*aA AK(5EsKK)*6GMM.%s, , KdXq!Z8999|oA'7Ax(a!312 2 2 
~7a<<"2!RW'=>>BBW\\Ha"66B<1GG&G}1===G}) 	%llnnG!||r~R4Z"^3"a.&89Q>K!q(NN"	# 	# 	# 	')H;q!^_Z#7	8 	8 	8
 zzkk'""mCT**y% 	((**CJr   c                     | j         d         }t          j        |d|f|          }t          t          d||           } ||fd|| |f           |S )Nr   rc   rP   compute_correction_factors_sos)rc   )rC   r   rV   r>   IIR_SOS_MODULE)sosrY   r   
n_sectionsr[   r\   s         r   r|   r|   e  sf    1JZH5UCCCJ"8*cK KKKthZ%@AAAr   Tc                    |t          j        | j        |j                  }|                    |          }||                    |          }| j        }| j        }	|j        d         }
t          ||	          }d}||         }d }|	dk    rt          | |          \  } }|t          ||          \  }}|t          j	        | |d          }| j        dk    rdn| j        d         }||z   dz
  |z  }||z  }t          |||          }t          j        |||f|          }|}d }|0t          j        |          }t          j        ||dz   |f|          }t          t          d|          }t          t          d|          }t          t          d	|          }t          t          d
|          }t          t          d|          }t          |d u           }||z
  }|d|z
  z   |z  } |||z  f|f|||||||f           t!          |
          D ])}||         }|9||d d d df         }||d d dd d f<   t#          ||dz
  |          ||d d d df<   |r |||z  f|f||||||||f            ||f|dz  f|||||         ||f           |dk    s|f|)||d d dd f         }||d d dd d f<   ||d d dd d d f<    ||f|f||||||         |f            |||z  f|f|||||||         ||f           |r |||z  f|f|||||||f           |t#          ||dz
  |          ||d d dd f<   +|	dk    rK|                    |          }t          j        |d|          }|j        j        s|                                }|^|                    |          }t/          |          dk    rt          j        |d|          }|j        j        s|                                }|||fS |S )Nr   rc   rM   Tr_   rP   first_pass_iir_sossecond_pass_iir_soscorrect_carries_sosfir_sospick_carriesr@   )r   rd   r   re   rC   rf   r   rK   rN   rg   r|   rV   
empty_liker>   r}   rj   ranger   rD   rB   rE   rF   rG   len) r   r~   rI   rk   r   rY   	apply_firro   rJ   rl   r   rZ   rm   zi_shaperp   rq   rr   r[   rs   all_carrieszi_outrt   ru   rv   
fir_kernelcarries_kernelrw   rx   ry   sb
section_zis                                    r   apply_iir_sosr   n  s@   } #)44
**U

C	~YYugGVF1J v..D	AAHzz D))
7	~'D11H
{j%d333FaKKqqQWQZHHq X-Hh&L/XuEEJj	8Qu. . .GKF	~$$jx!|Q'u6 6 6 ),c3 3)-s4 4.-s4 4!.)SAAJ%nncJJNt__N/O!n"45:NNHx')A4a>&' ' ' : &9 &9F>Aqqq"1"HJ#-K1aaa )#q1ua88F1aaa!8 	.J8+-{ !^X~;-. . . 	Oh!m-q(JqM3@	B 	B 	B a<<2>~111abb\
'1AAAq!!!G$(/AAAqrr111H%##aT8^^qM;01 1 1 O+-{1no"JqM;EF F F
  	$NH$&1nhk#$ $ $
 >)#q1ua88F1aaa8zzkk'""mCT**y% 	((**C	~))x==1]62t44F|( 	#[[]]F	~F{Jr   )r@   NNr]   )r@   NNr]   TN)-	itertoolsr   r   cupy._core.internalr   cupy._core._scalarr   cupy_backends.cuda.apir   cupyx.scipy.signal._arraytoolsr   r   r	   float32float64FLOAT_TYPESint8int16int32int64	INT_TYPES	complex64
complex128COMPLEX_TYPESuint8uint16uint32uint64UNSIGNED_TYPESTYPES
TYPE_PAIRS
TYPE_NAMESTYPE_PAIR_NAMES
IIR_KERNELIIR_SOS_KERNEL	RawModulerW   r}   r>   rK   rN   rQ   rz   r|   r   r   r   r   <module>r      s          5 5 5 5 5 5 + + + + + + * * * * * * 5 5 5 5 5 5
 
 
 |T\4<8Y
DJ
;	1*dk4;Di.0=@< <!6!6 < < <
 /....
OOJOOOY
vYv T^	_3 3"13 3 3BBzBBBC BAjAAAB CBzBBB	CD D D
  	3 3"13 3 3..:.../ 65*5556 54444	5
 65*5556 *)j)))*+ + +        P P P Pf   BF&*q q q q q qr   