"""
Partially inspired by torchtune's flex attention implementation

Citation:
@software{torchtune,
  title = {torchtune: PyTorch's finetuning library},
  author = {torchtune maintainers and contributors},
  url = {https://github.com/pytorch/torchtune},
  license = {BSD-3-Clause},
  month = apr,
  year = {2024}
}
"""

from typing import Optional, Union

import torch
from packaging import version

from ..utils import is_torch_flex_attn_available, logging
from ..utils.import_utils import _torch_version, is_torch_less_or_equal, is_torchdynamo_compiling


if is_torch_flex_attn_available():
    from torch.nn.attention.flex_attention import _DEFAULT_SPARSE_BLOCK_SIZE as flex_default_block_size
    from torch.nn.attention.flex_attention import BlockMask, create_block_mask, flex_attention


logger = logging.get_logger(__name__)


class WrappedFlexAttention:
    """
    We are doing a singleton class so that flex attention is compiled once when it's first called.
    """

    _instance = None
    _is_flex_compiled = False
    _compiled_flex_attention = None

    def __new__(cls, *args, **kwargs):
        if cls._instance is None:
            # Create a new instance if one doesn't already exist
            cls._instance = super().__new__(cls)
        return cls._instance

    @torch.compiler.disable(recursive=False)
    def __init__(self, training):
        """
        Initialize or update the singleton instance.
        """
        if not self._is_flex_compiled or training != self.training:
            self.training = training
            if is_torch_less_or_equal("2.5.1"):
                self._compiled_flex_attention = torch.compile(flex_attention, dynamic=False)
            # On PyTorch 2.6.0, compiling flex attention for training with the default mode is known to
            # misbehave, so fall back to "max-autotune-no-cudagraphs" there
            elif version.parse(_torch_version).base_version == "2.6.0" and training:
                self._compiled_flex_attention = torch.compile(
                    flex_attention, dynamic=False, mode="max-autotune-no-cudagraphs"
                )
            else:
                self._compiled_flex_attention = torch.compile(flex_attention)

            self._is_flex_compiled = True

    def __call__(self):
        return self._compiled_flex_attention


def compile_friendly_flex_attention(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    training=False,
    **kwargs,
) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
    # The first call initialises the singleton wrapper (compiling flex attention once); the second call
    # returns the compiled kernel. Inside a torchdynamo trace, use the eager kernel to avoid nested compilation.
    flex_attention_compiled = WrappedFlexAttention(training)() if not is_torchdynamo_compiling() else flex_attention
    return flex_attention_compiled(
        query,
        key,
        value,
        **kwargs,
    )


Offset = Union[torch.Tensor, int]


def make_flex_block_causal_mask(
    attention_mask_2d: torch.Tensor,
    attention_chunk_size: Optional[int] = None,
    query_length=None,
    key_length=None,
    offsets: Optional[tuple[Offset, Offset]] = None,
    is_causal: Optional[bool] = True,
) -> "BlockMask":
    """
    IMPORTANT NOTICE: This function is deprecated in favor of using the mask primitives in `masking_utils.py`,
    and will be removed in a future version without warnings. New code should not use it. It is only kept here
    for BC for now, while models using it are being patched accordingly.

    Create a block (causal) document mask for a batch of sequences, both packed and unpacked.
    Create the block (causal) mask logic and pass it into :func:`torch.nn.attention.flex_attention.create_block_mask`.
    The resultant BlockMask is a compressed representation of the full (causal) block
    mask. BlockMask is essential for performant computation of flex attention.
    See: https://pytorch.org/blog/flexattention/

    Args:
        attention_mask_2d (torch.Tensor): Attention mask for packed and padded sequences
        of shape (batch_size, total_seq_len). e.g.

        For unpacked sequence:
        [[1, 1, 1, 1, 0, 0, 0],
         [1, 1, 1, 1, 1, 0, 0]]

        For packed sequence:
        [[1, 1, 1, 2, 2, 2, 0],
         [1, 1, 2, 2, 2, 3, 3]]

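    Example (illustrative sketch, assuming a torch build where flex attention is available):

        >>> mask_2d = torch.tensor([[1, 1, 1, 2, 2, 2, 0]])  # two packed documents plus right padding
        >>> block_mask = make_flex_block_causal_mask(mask_2d)  # block-causal, document-aware BlockMask
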
    Returns:
        BlockMask
    """
    batch_size, total_seq_len = attention_mask_2d.shape
    if not key_length:
        key_length = total_seq_len
    if not query_length:
        query_length = total_seq_len
    # Pad the key/value length up to the next multiple of the default sparse block size, as older
    # flex attention kernels require it
    pad_len = ((key_length // flex_default_block_size) + 1) * flex_default_block_size
    attention_mask_2d = torch.nn.functional.pad(attention_mask_2d, value=0, pad=[0, pad_len])
    device = attention_mask_2d.device
    document_ids = attention_mask_2d.clone()

    if attention_chunk_size is not None:
        # Index of the attention chunk each position belongs to
        chunk_idxs = (document_ids.clone().fill_(1).cumsum(-1) - 1) // attention_chunk_size

    def causal_mask_mod(batch_idx, head_idx, q_idx, kv_idx):
        """
        Defines the logic of a block causal mask by combining both a standard causal mask
        and a block diagonal document mask.
        See :func:`~torchtune.modules.attention_utils.create_block_causal_mask`
        for an illustration.
        """
        causal_mask = q_idx >= kv_idx
        document_mask = document_ids[batch_idx, q_idx] == document_ids[batch_idx, kv_idx]
        padding_mask = attention_mask_2d[batch_idx, q_idx] > 0
        final_mask = causal_mask & padding_mask & document_mask
        return final_mask

    def chunk_causal_mask_mod(batch_idx, head_idx, q_idx, kv_idx):
        """
        Combines the chunk mask with the causal mask for chunked attention.
        """
        chunk_mask = chunk_idxs[batch_idx, q_idx] == chunk_idxs[batch_idx, kv_idx]
        causal_doc_mask = causal_mask_mod(batch_idx, head_idx, q_idx, kv_idx)
        return chunk_mask & causal_doc_mask

    def default_mask_mod(batch_idx, head_idx, q_idx, kv_idx):
        """
        Utilizes default attention mask to enable encoder and encoder-decoder
        attention masks.
        """
        document_mask = document_ids[batch_idx, q_idx] == document_ids[batch_idx, kv_idx]
        # kv indexing is crucial here so that padded key/value positions are masked out
        padding_mask = attention_mask_2d[batch_idx, kv_idx] > 0
        final_mask = padding_mask & document_mask
        return final_mask

    if not is_causal:
        mask_mod_maybe_combined = default_mask_mod
    else:
        mask_mod_maybe_combined = causal_mask_mod if attention_chunk_size is None else chunk_causal_mask_mod

    if offsets is not None:
        q_offset = offsets[0].to(device)
        kv_offset = offsets[1].to(device)

        def mask_mod(batch_idx, head_idx, q_idx, kv_idx):
            offset_q = q_idx + q_offset
            offset_kv = kv_idx + kv_offset
            return mask_mod_maybe_combined(batch_idx, head_idx, offset_q, offset_kv)
    else:
        mask_mod = mask_mod_maybe_combined

    return create_block_mask(
        mask_mod=mask_mod,
        B=batch_size,
        H=None,  # broadcast the mask over all attention heads
        Q_LEN=query_length,
        KV_LEN=pad_len,
        device=device,
        _compile=is_torchdynamo_compiling(),
    )


def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


def flex_attention_forward(
    module: torch.nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Union[torch.Tensor, "BlockMask"],
    scaling: Optional[float] = None,
    softcap: Optional[float] = None,
    head_mask: Optional[torch.Tensor] = None,
    s_aux: Optional[torch.Tensor] = None,
    **kwargs,
) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
    if head_mask is not None:
        logger.warning_once(
            "`flex_attention` does not support `head_mask`. Please set your attention to `eager` if you want this feature."
        )

    if kwargs.get("dropout", 0.0) > 0:
        raise ValueError(
            "`flex_attention` does not support `dropout`. Please use it with inference"
            " only (`model.eval()`) or turn off the attention dropout in the respective config."
        )

    block_mask = None
    score_mask = None
    if isinstance(attention_mask, BlockMask):
        block_mask = attention_mask
    else:
        score_mask = attention_mask

    if score_mask is not None:
        score_mask = score_mask[:, :, :, : key.shape[-2]]

    def score_mod(score, batch_idx, head_idx, q_idx, kv_idx):
        if softcap is not None:
            score = softcap * torch.tanh(score / softcap)
        if score_mask is not None:
            score = score + score_mask[batch_idx][0][q_idx][kv_idx]
        if head_mask is not None:
            score = score + head_mask[batch_idx][head_idx][0][0]
        return score

    enable_gqa = True
    num_local_query_heads = query.shape[1]

    # When running tensor parallel, the number of local query heads may not be a power of two, in
    # which case flex attention's native GQA path cannot be used; repeat the key/value heads instead.
    if not ((num_local_query_heads & (num_local_query_heads - 1)) == 0):
        key = repeat_kv(key, query.shape[1] // key.shape[1])
        value = repeat_kv(value, query.shape[1] // value.shape[1])
        enable_gqa = False

    kernel_options = kwargs.get("kernel_options")
    # The CPU kernel cannot return the log-sum-exp, which attention sinks (`s_aux`) require
    return_lse = query.device.type != "cpu"
    if not return_lse and s_aux is not None:
        raise ValueError(
            "Attention sinks cannot be run on CPU with flex attention. Please switch to a different device, e.g. CUDA"
        )

    flex_attention_output = compile_friendly_flex_attention(
        query,
        key,
        value,
        score_mod=score_mod,
        block_mask=block_mask,
        enable_gqa=enable_gqa,
        scale=scaling,
        kernel_options=kernel_options,
        return_lse=return_lse,
        training=module.training,
    )

    if return_lse:
        # lse is returned in float32
        attention_output, lse = flex_attention_output
        lse = lse.to(value.dtype)

        if s_aux is not None:
            # Fold the per-head sink logits into the softmax denominator and renormalize the output
            batch_size, num_heads, seq_len_q, _ = attention_output.shape
            sinks = s_aux.view(1, -1, 1, 1).expand(batch_size, num_heads, seq_len_q, -1)
            lse_expanded = lse.unsqueeze(-1)
            combined_lse = torch.logsumexp(torch.cat([lse_expanded, sinks], dim=-1), dim=-1, keepdim=True)
            renorm_factor = torch.exp(lse_expanded - combined_lse)
            attention_output = attention_output * renorm_factor
    else:
        attention_output = flex_attention_output
        lse = None

    attention_output = attention_output.transpose(1, 2).contiguous()

    return attention_output, lse