
    .`i5                        d dl Z d dlmZ d dlmZmZ ej        dej        dej        dej        fd            Z G d d	          Z		 d-de j
        de j
        dede	dedee j
        e j
        f         fdZ	 	 d.de j
        de j
        dede	dz  fdZ	 	 	 d/de j
        de j
        dede	dz  def
dZ	 	 	 d/de j
        de j
        dede	dz  def
dZej        dej        dej        dej        dej        dej        dej        fd             Z ed!           d"d"fd#e j
        d$e j
        d%ed&ed'ede j
        fd(Zej        d)ej        dej        dej        dej        dej        f
d*            Z	 	 d0d+e j
        d$e j
        d&ed'ede j
        f
d,ZdS )1    N)GroupCoordinator)tltritonHEAD_DIM	N_ROUNDED	IS_BASE_Ec                 n   t          j        d                              t           j                  }t          j        d                              t           j                  }t          j        d|          }t          j        d|          }||z  ||z  z   ||	z  z   }t          j        ||z             }t          j        ||k    |t          d          k    z  t          d           |          }t          j        |d          }t          j        |t          d           k    d|          }||z  }|r?t          j	        |          }t          j
        |d          }t          j        |          }n>t          j        |          }t          j
        |d          }t          j        |          }||z  }||z  ||	z  z   }t          j        ||z   |           ||z  ||z  z   ||z  z   }|
|z  ||z  z   ||	z  z   }t          j        ||z             }||z
  }t          j        ||k    |t          d          k    z  t          d           |          }|rt          j	        |          nt          j        |          }t          j        | |z             }||z  }t          j        ||z   |           dS )aB  
    Apply the all-gathered lses to correct each local rank's attention
    output. we still need perform a cross-rank reduction to obtain the
    final attention output.

    Args:
        outputs_ptr (triton.PointerType):
            Pointer to input tensor of shape [ B, H, D ]
        lses_ptr (triton.PointerType):
            Pointer to input tensor of shape [ N, B, H ]
        new_output_ptr (triton.PointerType):
            Pointer to output tensor of shape [ B, H, D ]
        vlse_ptr (triton.PointerType):
            Pointer to output tensor of shape [ B, H ]
    r   )axis   infN)r   
program_idtoint64arangeloadwherefloatmaxexpsumlogexp2log2store)outputs_ptrnew_output_ptrlses_ptrvlse_ptroutputs_stride_Boutputs_stride_Houtputs_stride_Dlses_stride_Nlses_stride_Blses_stride_Hlse_idxr   r   r   	batch_idxhead_idx	d_offsetsnum_n_offsetslse_offsetslselse_maxlse_explse_accoutput_offsets
lse_offsetlse_tmplse_finallyfactoroutputs                                p/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/v1/attention/ops/common.py_correct_attn_cp_out_kernelr6   	   s   @ 1%%%((22I}!$$$''11H	!X&&IIa++M 	%
m
#	$
]
"	#  '([(
)
)C
(C3J3%,,#67%,,
L
LCfSq!!!Ghw5<<-/G<<G7NC &++&q)))fWoo'#,,&q)))gg7NCm+h.FFKHX#S))) 	$$
%
%	&
&
&	'  	-)m";;h>VV  gh+,,GC-K(		#uU||(CD	u K
 %.GRVK   27;3G3GFW[>122Ff_FH^n,f55555    c                       e Zd ZdZd Zd ZdS )CPTritonContextzEThe CPTritonContext is used to avoid recompilation of the Triton JIT.c                     d | _         d S Ninner_kernel)selfs    r5   __init__zCPTritonContext.__init__d   s     r7   c                 ^    | j          ||         |i || _         d S  | j         |         |  d S r;   r<   )r>   kernelgridregular_args
const_argss        r5   call_kernelzCPTritonContext.call_kernelg   sG    $ ,tl Ij I ID#Dd#\2222r7   N)__name__
__module____qualname____doc__r?   rE    r7   r5   r9   r9   a   s8        OO! ! !3 3 3 3 3r7   r9   Toutlsescp_rankctxis_lse_base_on_ereturnc                 2   |t                      }| j        dk    r&| j        d         dk    r|                     d          } | j        dk    sJ dt	          | j                               |j        dk    r&|j        d         dk    r|                    d          }|j        dk    r&|j        d         dk    r|                    d          }|j        dk    sJ dt	          |j                               | j        \  }}}|j        d         }|                                 \  }	}
}|                                \  }}}t          j        ||f||f|j        |j	        	          }||df}| | |||	|
|||||f}|||d
} |j
        t          |g|R i | | |fS )ae  Correct the attention output using the all-gathered lses.

    Args:
        out: Tensor of shape [ B, H, D ]
        lses: Tensor of shape [ N, B, H ]
        cp_rank: Current rank in the context-parallel group
        ctx: Triton context to avoid recompilation

    Returns:
        Tuple of (out, lse) with corrected attention and final log-sum-exp.
    N   r      z'expected out [B,H,D] or [B,1,H,D], got zAexpected lses [N,B,H] (optionally with a 1-sized extra dim), got r   devicedtype)r   r   r   )r9   ndimshapesqueezetuplestridetorchempty_stridedrV   rW   rE   r6   )rK   rL   rM   rN   rO   BHDNo_sBo_sHo_sDl_sNl_sBl_sHr+   rB   rC   rD   s                      r5   correct_attn_outri   n   s   $ { x1}}1**kk!nn8q===VE#)DTDTVV===yA~~$*R.A--||ByA~~$*Q-1,,||A9>>>	#TZ  	# 	# >>
 iGAq!
1A
 zz||D${{}}D$ 
	
AtT[
  C
 q!9D 	L  a>NOOJCO/SSSS
SSS8Or7   cp_attn_outcp_attn_lsecp_groupc                 V   |j         dk    r| S |t                      }t          j        |j         f|j        z   |j        |j                  }|                                }|                    |d          	                    |          }t          | ||j        ||          \  }}||fS )<
    cp_attn_out: [ B, H, D ]
    cp_attn_lse: [ B, H ]
    r   N)rW   rV   r   dim)rO   )
world_sizer9   r]   emptyrY   rW   rV   
contiguous
all_gatherview_asri   rank_in_group)rj   rk   rl   rN   rO   rL   rK   r+   s           r5   _cp_lse_commonrw      s     a
{;		!22!  D ((**K{22::4@@D)  HC 8Or7   F
return_lsec                     t          | ||||          \  }}|                    |d          }|r7|j        d         |j        z  }|j        }	|dd||	z  ||	dz   z  f         }||fS |S )rn   rN   rO   r   ro   N)rw   reduce_scatterrY   rq   rv   )
rj   rk   rl   rN   rx   rO   rK   r+   cp_num_headsrM   s
             r5   cp_lse_ag_out_rsr}      s     [(FV  HC 
!
!#1
!
-
-C y|x'::(!!!\G+lgk.JJJKCxJr7   c                 j    t          | ||||          \  }}|                    |          }|r||fS |S )rn   rz   )rw   
all_reduce)rj   rk   rl   rN   rx   rO   rK   r+   s           r5   cp_lse_ag_out_arr      sS     [(FV  HC 

c
"
"C CxJr7   rb   ra   Lmax	PAD_VALUEBLOCK_TBLOCK_Dc	                 ^   t          j        d          }	t          j        d          }
t          j        d          }|
|z  t          j        d|          z   }||z  t          j        d|          z   }d}t          |	          D ]}|t          j        ||z             z  }t          j        ||	z             }||k     }||z   }||k     |z  }| |d d d f         |z  z   |d d d f         z   }||	|z  |z   d d d f         |z  z   |d d d f         z   }|d d d f         |k     }t          j        ||g|t           j                  }t          j        |||d d d f         |z             t          j        ||d d d f         |z            }t          j        |||d d d f         |z             d S Nr   r      )mask)r   r   r   ranger   fullfloat32r   )x_ptrout_ptrlengths_ptrrb   ra   r   r   r   r   pid_bpid_tpid_doff_toff_din_startiseq_lent_maskin_row	valid_row	x_row_ptrout_row_ptrd_maskpad_valsx_valss                            r5   _pack_seq_kernelr     s    M!EM!EM!EGObi7333EGObi7333E H5\\ - -BGK!O,,,gkE)**G T\F FF*I qqq$w!++eD!!!Gn<I UT\E1111d7;a??%aaa.PK 47^aFw)9bjAAHH[(46)ABBBB WYYqqq$w%7&%@AAAFH[&yD'9F'BCCCCCCr7   r   @   xlengths	pad_valueblock_tblock_dc                    | j         }t          |          dk    r,|d         }|                     |d          }|j         d         }n| j         \  }}| }|                                }	t	          |                                                                          }
t          j        |	|
|f| j	        | j
                  }|	t          j        |
|          t          j        ||          f}t          |         |||                                |||
t          |          ||dd           t          |          dk    r$|	|
f|dd         z   }|                    |          }|S )	a  
    Pack sequences of different lengths into a batched tensor.

    Args:
        x: [N, ...] - input tensor where N is total number of tokens
        lengths: [B] - sequence lengths for each batch
        pad_value: value to use for padding
        block_t: block size for time dimension
        block_d: block size for feature dimension

    Returns:
        packed: [B, Lmax, ...] - packed tensor
    r   r   rT   r   rU   rR   )r   r   r   	num_warps
num_stagesN)rY   lenreshapenumelintr   itemr]   rr   rV   rW   r   cdivr   r   )r   r   r   r   r   original_shaperb   
x_reshapedra   r_   r   rK   rB   output_shapes                 r5   pack_seq_tritonr   7  s]   , WN
>Q1YYq"%%
Qw1
Aw{{}}!!##$$D +q$l1817
C
C
CCv{4))6;q'+B+BCDT			""    >Q4y>!""#55kk,''Jr7   r_   c                    t          j        d          }t          j        d          }	t          j        d          }
|	|z  t          j        d|          z   }|
|z  t          j        d|          z   }d}t          |          D ]}|t          j        ||z             z  }t          j        ||z             }||k     }||k     |z  }||z   }| ||z  |z   d d d f         |z  z   |d d d f         z   }||d d d f         |z  z   |d d d f         z   }|d d d f         |k     }t          j        ||d d d f         |z            }t          j        |||d d d f         |z             d S r   )r   r   r   r   r   r   )
packed_ptrr   r   r_   r   ra   r   r   r   r   r   r   r   r   r   r   r   r   out_rowpacked_row_ptrr   r   packed_valss                          r5   _unpack_seq_triton_kernelr   t  s    M!EM!EM!EGObi7333EGObi7333E H5\\ - -BGK!O,,,gkE)**G T\FF*I G  54<%#7D"AA"EEdTUTUTUgVN GAAAtG,q005qqq>AK 47^aF'.yD/AF/JKKKKH[+Iaaag,>,GHHHHHHr7   packed_tensorc                    | j         }t          |          dk    r2|dd         \  }}|                     ||d          }|j         d         }n| j         \  }}}| }t          |                                                                          }	t          j        |	|f| j        | j	                  }
|t          j        ||          t          j        ||          f}t          |         ||
|                                |||||dd
  
         t          |          dk    r#|	f|dd         z   }|
                    |          }
|
S )a  
    Unpack a packed decode query tensor back to the original format.
    Efficient Triton implementation.

    Args:
        packed_tensor: [B, Lmax, ...] - packed tensor from pack_seq_triton
        lengths: [B] - sequence lengths for each batch
        block_t: block size for time dimension
        block_d: block size for feature dimension

    Returns:
        unpacked_tensor: [N, ...] where N = sum(lengths)
    rS   Nr   rT   rU   rR   )r   r   r   r   )rY   r   r   r   r   r   r]   rr   rV   rW   r   r   r   )r   r   r   r   r   r_   r   packed_reshapedra   rb   rK   rB   r   s                r5   unpack_seq_tritonr     sS   * #(N
>Q !$4'//4<<!!$"(
4' 	GKKMM  !!A
+q!f]%9AT
U
U
UCv{4))6;q'+B+BCDd#		    >QtnQRR00kk,''Jr7   )T)NT)NFT)r   r   )r]   vllm.distributed.parallel_stater   vllm.triton_utilsr   r   jit	constexprr6   r9   Tensorr   boolr[   ri   rw   r}   r   r   r   r   r   r   rJ   r7   r5   <module>r      s    < < < < < < ( ( ( ( ( ( ( ( T6 lT6 |T6 |T6 T6 T6 T6n
3 
3 
3 
3 
3 
3 
3 
3$ "D D	D
,D D 
	D
 D 5<%&D D D DV #'        
4		       N #'   
4		
    : #'   
4		
    , ,D 
|	,D
 
|,D ,,D |,D \,D \,D ,D ,D ,Dd e}: :|:\: : 	:
 : \: : : :z 'I 
|	'I
 ,'I 
|'I \'I \'I 'I 'I 'IZ 	6 6<6\6 6 	6
 \6 6 6 6 6 6r7   