
     `i                        d Z ddlmZmZmZ ddlZddlZddlmZ ddl	mc m
Z ddlmZmZ ddlmZ ddlmZmZ ddlmZ dd	lmZ dd
lmZmZmZ ddlmZmZ ddlm Z m!Z! ddl"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z* ddl+m,Z,m-Z-m.Z.m/Z/m0Z0  ej1        e2          Z3dFde4fdZ5	 dGdej6        de4de4de4fdZ7dej6        dej8        de4de9de4dej6        fdZ:d dej;        fd!ej6        d"e4d#e4d$e<d%e4d&ej=        de>ej6        ej6        f         fd'Z?d(ej6        d)ee4         dej6        fd*Z@ G d+ d,e'          ZA G d- d.e(          ZB G d/ d0e           ZC G d1 d2e%          ZD G d3 d4e)          ZE G d5 d6e&          ZFe G d7 d8e$                      ZG G d9 d:eG          ZH G d; d<eG          ZI G d= d>eG          ZJ G d? d@eG          ZK G dA dBeG          ZL G dC dDe#          ZMg dEZNdS )Hz<Blt modular model, inheriting from Mllama where appropriate.    )CallableOptionalUnionN   )CacheDynamicCache)create_causal_mask)BaseModelOutputWithPastCausalLMOutputWithPast)ALL_ATTENTION_FUNCTIONS)Unpack)TransformersKwargsauto_docstringlogging)OutputRecordercheck_model_inputs   )Cohere2RotaryEmbeddingrotate_half)MllamaForCausalLMMllamaPreTrainedModelMllamaSelfAttentionDecoderLayerMllamaTextCrossAttentionMllamaTextMLPMllamaTextRMSNormMllamaTextSelfAttentioneager_attention_forward   )	BltConfigBltGlobalTransformerConfigBltLocalDecoderConfigBltLocalEncoderConfigBltPatcherConfigʚ;primec                     t          j        |t           j        | j                  }t          j        | j        d         | j                  }||z  }t          j        | |z  d          S )a  
    A polynomial rolling hash algorithm that converts sequences
    of tokens into hash values. The hash is computed as:
        hash = (token_0 * prime^0 + token_1 * prime^1 + ... + token_n * prime^n)

    The rolling hash allows the model to efficiently
    identify and encode recurring byte-level patterns in the input text.

    Args:
        token_tensor (torch.Tensor): [batch_size, seq_len, group_size] containing token IDs to hash
        prime (int): Prime number used as the base for the polynomial hash.

    Returns:
        torch.Tensor: Hash values of shape [batch_size, seq_len] where each value
                     represents the hash of the corresponding token group

    Example:
        >>> tokens = torch.tensor([[1, 2, 3], [4, 5, 6]])
        >>> hashes = rolling_polynomial_hash(tokens, prime=31)
        >>> # hash[0] = 1*31^0 + 2*31^1 + 3*31^2
        >>> # hash[1] = 4*31^0 + 5*31^1 + 6*31^2
    dtypedevicer)   dim)torchtensorint64r)   arangeshapesum)token_tensorr%   prime_tensorpowersprime_powerss        w/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/blt/modular_blt.pyrolling_polynomial_hashr9   9   sb    . <U[ATUUUL\,,R09LMMMF'L9\L0b9999    0u  	token_ids
group_sizemax_hashc                 L   t          j                    5  | j        \  }}t          j        ||dz
  t           j        | j                  }t          j        || gd          }|                    d|d          }t          ||          }	|	|z  }
ddd           n# 1 swxY w Y   |
S )z1Hash token groups and map to range [0, max_hash].r   r'   r,   N)	r.   no_gradr2   zerosr0   r)   catunfoldr9   )r<   r=   r%   r>   
batch_sizeseq_lenpaddingpadded_tokenswindowshasheshash_valuess              r8   byte_group_hash_functionrK   V   s     
 	( 	('o
G+j*q.T]Tdeee	7I"6A>>>  &&q*a88(%88x'	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	( s   A9BB Blocal_encoder_tokensencoder_hash_tok_embedding$encoder_hash_byte_group_nb_functionsencoder_hash_byte_group_sizeencoder_hash_byte_group_vocabreturnc                     g d}|                     |           }d}t          |          D ]L}	||	t          |          z           }
|D ]/}t          | ||
|          }|||z  z   }| ||          z  }|dz  }0M|S )z=Compute token embeddings enhanced with hash-based embeddings.)r$   l   21A ioYl   vt l   . l   }g l   Au l   0 l   T l   AK l   | r   r   )embed_tokensrangelenrK   )rL   local_encoderrM   rN   rO   rP   primes
embeddingsembedding_idxfunc_nbr%   r=   hash_idsoffset_hash_idss                 r8   compute_hash_embeddingsr]   h   s      F ++,@AAJM=>>  wV,-6 	 	J/0DjRWYvwwH&9V)VVO44_EEEJQMM	 r:   F	patch_idsnum_patchessequence_lengthpatches_as_queriescross_attn_kr(   c                    | j         \  }}| j        }|r||z  }	|}
t          j        ||                              d                              d                              |||          }|                     d                              |||          }n|}	||z  }
|                     d                              |||          }t          j        ||                              d                              d                              |||          }||k    }|rdnd}|                    ||          }||	|
f}|j         |k    rt          d|j          d|           |                    d          }d|                    |          z
  }|	                    |                    t          j
                  t          j        |          j                  }|S )	aR  
    Prepare cross-attention mask for patch-based attention, following mllama's robust approach.

    This function creates masks that control which patches can attend to which other patches,
    with support for query/key role swapping and cross-attention multipliers.

    Args:
        patch_ids (torch.Tensor): Tensor of shape [batch_size, seq_len] containing patch ids.
        num_patches (int): Total number of patches.
        sequence_length (int): Length of the sequence.
        patches_as_queries (bool): If True, patches are used as queries, otherwise as keys.
        cross_attn_k (int): Cross-attention multiplier for repeating patches.
        dtype (torch.dtype): Data type for the output mask.

    Returns:
        Tuple[torch.Tensor, torch.Tensor]:
            - cross_attention_mask: 4D tensor [batch_size, 1, q_len, kv_len]
    r+   r   r*   r   r,   zCross attention mask shape z doesn't match expected g      ?)r2   r)   r.   r1   	unsqueezeexpandrepeat_interleave
ValueErrortomasked_fillboolfinfomin)r^   r_   r`   ra   rb   r(   rD   rE   r)   q_lenkv_lenq_patch_idskv_patch_idscross_attention_mask
repeat_dimexpected_shapeinverted_cross_attn_masks                    r8   #_prepare_patch_cross_attention_maskru      s   4 $/JF  
l*  LV444Yq\\Yr]]VJW55	 	 !**1--44ZgVV|+))"--44Z+VVLV444>>qAAKKANNUUV`bikvww 	 ',6 )0bJ/AA,T^A__ !%0N!^33n*>*Dnn^lnn
 
 	

 099!<<  #%9%<%<U%C%CC3?? ##EJ//U1C1C1G   r:   patch_lengthsmax_patch_lengthc                    || S |                      d          }g }| D ]}g }||dk             D ]Y}|                                }t          ||          \  }}|                    |g|z             |r|                    |           Z|                    |           t          d |D                       }	t          j        ||	f| j        | j	                  }
t          |          D ]<\  }}|r5t          j        || j        | j	                  |
|dt          |          f<   =|
dk                        d                                          |
j        d         k     ra|
dk                        d                                                                                                          dz   }|
ddd|f         }
|
S )a  
    Splits patch lengths into smaller segments if they exceed `max_patch_length`.
    Pads the result to uniform length across the batch.

    Args:
        patch_lengths (torch.Tensor): [batch_size, num_patches] tensor of patch lengths.
        max_patch_length (int, optional): Maximum allowed length per patch.

    Returns:
        torch.Tensor: [batch_size, max_len] tensor of split and padded patch lengths.
    Nr   c              3   4   K   | ]}t          |          V  d S N)rU   ).0splitss     r8   	<genexpr>z(process_patch_lengths.<locals>.<genexpr>   s(      66&#f++666666r:   r'   r,   r   )sizeitemdivmodextendappendmaxr.   rA   r(   r)   	enumerater/   rU   anyr3   r2   nonzero)rv   rw   rD   	processedseqr|   lengthfull_chunks	remaindermax_lenpaddedilast_nonzeros                r8   process_patch_lengthsr      s    ##A&&JI ! !#'l 	) 	)F[[]]F%+F4D%E%E"KMM+,{:;;; )i(((     66I66666G[*g.m6IR_RfgggFy)) t t	6 	t',|F-BU^k^r's's'sF1mFm#$ 	!Q##%%Q77!((Q(//7799==??DDFFJ=L=()Mr:   c                       e Zd ZdS )BltMLPN__name__
__module____qualname__ r:   r8   r   r             Dr:   r   c                       e Zd ZdS )
BltRMSNormNr   r   r:   r8   r   r     r   r:   r   c                       e Zd ZdS )BltRotaryEmbeddingNr   r   r:   r8   r   r     r   r:   r   c                   $     e Zd Zdef fdZ xZS )BltTransformerLayer	layer_idxc                    t                                                       t          ||          | _        t	          |          | _        t          |j        |j                  | _	        t          |j        |j                  | _
        d S )N)configr   eps)super__init__BltSelfAttention	self_attnr   mlpr   hidden_sizerms_norm_epsinput_layernormpost_attention_layernormselfr   r   	__class__s      r8   r   zBltTransformerLayer.__init__  sv    )9MMM&>>)&*<&BUVVV(263E6K^(_(_(_%%%r:   )r   r   r   intr   __classcell__r   s   @r8   r   r     sO        `# ` ` ` ` ` ` ` ` ` `r:   r   c            	       j     e Zd Zdedef fdZ	 	 	 ddej        dej        dej        d	ef fd
Z	 xZ
S )r   r   r   c                 Z    t                                          ||           d| _        d S )NT)r   r   	is_causalr   s      r8   r   zBltSelfAttention.__init__  s(    +++r:   FNhidden_statesattention_maskposition_embeddings	use_cachec           
      D     t                      j        d||||||d|S )N)r   r   r   r   past_key_valuescache_positionr   )r   forward)	r   r   r   r   r   r   r   kwargsr   s	           r8   r   zBltSelfAttention.forward#  sE     uww 
') 3+)
 
 
 
 	
r:   )FNN)r   r   r   r   r   r   r.   Tensorrj   r   r   r   s   @r8   r   r     s        y S        
 
|
 
 #\	

 
 
 
 
 
 
 
 
 
 
r:   r   c                        e Zd ZdZddededee         f fdZ	 	 	 	 ddej	        deej	                 d	ee
         d
eej	                 deej                 dee         fdZ xZS )BltCrossAttentionz<Cross-attention module for Blt, following transformers styleNr   r   r   c                     t                                                       d| _        t          | j        |j                  | _        t          | j        |j                  | _        d S )NFr   )r   r   r   r   r   r   q_normk_norm)r   r   r   r   r   s       r8   r   zBltCrossAttention.__init__;  sX     !1v7JKKK !1v7JKKKr:   r   cross_attention_statesr   r   r   r   c                 `   |                                 \  }}}	|                     |          }
|                     |
          }
|
                    ||| j        | j                                      dd          }
||                     |          }|                     |          }| 	                    |          }|                    |d| j
        | j                                      dd          }|                    |d| j
        | j                                      dd          }|"|                    ||| j        d|i          \  }}nJ|d         dk    r/|j        | j                 j        |j        | j                 j        }}nt!          d          t"          }| j        j        dk    rt(          | j        j                 } || |
|||f| j        sdn| j        | j        d	|\  }}|                    ||d                                          }|                     |          }||z   }||fS )
Nr   r   r*   r   r   z^Cross attention layer can't find neither `cross_attn_states` nor cached values for key/values!eagerg        )dropoutscaling)r~   r   q_projview	num_headshead_dim	transposer   k_projv_projnum_key_value_headsupdater   layerskeysvaluesrg   r   r   _attn_implementationr   trainingr   r   reshape
contiguouso_proj)r   r   r   r   r   r   r   bszrm   _query_states
key_statesvalue_statesattention_interfaceattn_outputattn_weightss                   r8   r   zBltCrossAttention.forwardA  sS    &**,,UA{{=11{{<00#((eT^T]SS]]^_abcc!-%)[[1G%H%H"%;<<J;;'=>>L#b$2JDMZZddefhijjJ',,S"d6NPTP]^^hhijlmnnL*+:+A+Adn?OQ_>`, ,(
L A!##&t~6;&t~6= %JJ
 p   )@;+w66"9$+:Z"[$7$7	%
  $}>CC$,L	%
 	%
 	%
 	%
!\ "))#ub99DDFFkk+..!M1L((r:   rz   NNNN)r   r   r   __doc__r   r   r   r   r.   r   r   
LongTensorr   r   r   r   r   s   @r8   r   r   8  s        FFL Ly LS LxPS} L L L L L L :>+/15593) 3)|3) !) 63) "%	3)
 !.3) !!123) +,3) 3) 3) 3) 3) 3) 3) 3)r:   r   c                   x    e Zd ZU eed<   dZdZdZdgZ e	e
dd           e	edd          dZd	 Zd
 Zd ZdS )BltPreTrainedModelr   Fr   r   local_decoderindex
layer_namer   )r   
attentionsc                      t          d          NzNo need to inherit it!AttributeErrorr   modules     r8   _init_weightsz BltPreTrainedModel._init_weights      5666r:   c                      t          d          r   r   r   s     r8   _update_causal_maskz&BltPreTrainedModel._update_causal_mask  r   r:   c                      t          d          r   r   r   s     r8   5_prepare_4d_causal_attention_mask_with_cache_positionzHBltPreTrainedModel._prepare_4d_causal_attention_mask_with_cache_position  r   r:   N)r   r   r   r   __annotations___supports_attention_backend_supports_flash_attn_supports_flex_attn_no_split_modulesr   r   r   _can_record_outputsr   r   r   r   r:   r8   r   r   w  s         "' ./'(;1Q`aaa$n%5Q?[[[ 
7 7 77 7 77 7 7 7 7r:   r   c                   t    e Zd ZU eed<   d eedd          iZdef fdZ	 	 	 	 	 	 	 	 	 	 dde	e
j                 d	e	e
j                 d
e	e
j                 de	e
j                 de	e
j                 de	e         de	e
j                 de	e
j                 de	e         de	e
j                 dee         fdZd Z xZS )BltLocalEncoderr   encoder_attentionsr   rV   r   c                    t                                                     d| _        | _        t	          j        fdt          j                  D                       | _        t                    | _
        t	          j        j        j        j        z  d          | _        t	          j        j        j                  | _        t	          j                    | _        j        rj        nd}t          |          D ]1}| j                            t+          |j                             2|                                  d S )NFc                 0    g | ]}t          |          S r   r   r{   r   r   s     r8   
<listcomp>z,BltLocalEncoder.__init__.<locals>.<listcomp>  $    eee	 33eeer:   r   in_featuresout_featuresbiasr   r   r   r   )r   r   gradient_checkpointingr   nn
ModuleListrT   num_hidden_layersr   r   
rotary_embLinearr   rb   patch_embedding_projection	Embedding
vocab_sizerS   cross_attn_layerscross_attn_all_layersr   r   	post_initr   r   layers_to_addr   r   s    `  r8   r   zBltLocalEncoder.__init__  sI      &+#meeeeU6KcEdEdeee
 
 -F;;;*,)*+f.AA+
 +
 +
'
 L):F<NOO!#4:4PW00VW}-- 	 	I"))!9RXRdeee    	r:   N	input_idsinputs_embedspatch_embedsr   position_idsr   r   encoder_attention_maskr_   r^   r   c           	      ~   ||                      |          }|j        d         }t          j        || j        j        | j                  }|Mt          j        |j        d         |j                  	                    d          
                    |d          }|                     ||          }t          j        || j        j        | j                  }t          | j                  D ]\  }} ||f||||d|}|t          | j                  dz
  k    s| j        j        r|                     ||	|
          }|                     |          }|                    ||j        d         | j        j        z  | j        j                  }| j        j        r|nd} | j        |         d|||d|\  }}||z   }|}||fS )	Nr   pr   r   r+   r*   r   r   r   r   r   r   r   r   )rS   r2   Fr   r   r   r.   r1   r)   rd   re   r  r   r   rU   r  patch_reducer  r   rb   r   r  )r   r  r  r  r   r  r   r   r   r_   r^   r   rD   r   r   idxlayerr   cross_attention_outputr   encoder_cross_statess                        r8   r   zBltLocalEncoder.forward  s      --i88M"(+
	-4;3FQUQ^___]03M<PQQQ[[\]^^eefprtuu  #oom\JJ	-4;3FQUQ^___#DK00 	E 	EJC!E$7- /-   M c$+&&***dk.O*#00YWW#>>|LL+33 21 58P PRVR]Ri    $(;#DKCC!	,MD,B9,M -".+8#9- - 	- -)&  ,.DD+222r:   c                 B   |j         d         }|j         d         }|                    d                              dd|j         d                   }t          j        |||f|j        |j                  }|                    |d|dd          }|ddd|ddf         }|S )	a  
        Reduce variable length patches to single embedding per patch
        Note: this works with variable number of patches for different sequences in the batch
        It handles variable length patches by assuming that patch_lengths will be 0 for any
        extra patches on the *right*. Since there can be a variable number of patches
        this function also return the number of patches for each sequence in the batch.
        Any embeddings on the right that are not allocated to a patch
        (i.e. if the sum(patch_lengths[i]) < seq_len for any i)
        will be sent to a dummy patch, which is trimmed before returning.
        r   r*   r'   r   amaxF)srcr-   r   reduceinclude_selfN)r2   rd   re   r.   rA   r(   r)   scatter_reduce)r   r   max_num_patchesr^   rD   embedding_dimreduced_embeddingss          r8   r'  zBltLocalEncoder.patch_reduce  s     #(+
%+B/''++222r=;Nr;RSS	"[-8@S\i\p
 
 
 0>> ? 
 
 03CO3CQQQ0FG!!r:   
NNNNNNNNNN)r   r   r   r"   r   r   r   r   r   r   r.   r   r   r   r   r   r   r   r'  r   r   s   @r8   r   r     sq        !!!!nn-=QSbccc4      2 1504/31537+/599=%),043 43E,-43  -43 u|,	43
 !.43 u/043 "%43 !!1243 !) 643 c]43 EL)43 +,43 43 43 43l" " " " " " "r:   r   c                   0    e Zd ZU eed<   def fdZe	 	 	 	 	 	 	 	 ddeej	                 deej
                 deej
                 deej
                 deej	                 d	ee         d
eej	                 deej
                 dee         fd            Z xZS )BltLocalDecoderr   c                    t                                                     d| _        | _        d| _        t          j        fdt          j                  D                       | _	        t                    | _        t          j        j        j        j        z  d          | _        t#          j        j                  | _        t          j                    | _        j        rj        nd}t          |          D ]1}| j                            t/          |j                             2|                                  d S )	NFTc                 0    g | ]}t          |          S r   r  r  s     r8   r  z,BltLocalDecoder.__init__.<locals>.<listcomp>	  r  r:   r  r	  r   r   r  )r   r   r  r   cross_attn_decoderr  r  rT   r  r   r   r  r  hidden_size_globalr   rb   r  r   r   normr  r  r   r   r  r  s    `  r8   r   zBltLocalDecoder.__init__  sQ      &+#"&meeeeU6KcEdEdeee
 
 -F;;;*,)1+f.AA+
 +
 +
'
 v1v7JKKK	!#4:4PW00VW}-- 	 	I"))!9RXRdeee    	r:   Nr  r  r  r   r  r   r   r   r   c	           	         |j         d         }
|}|                     |          }|                    |
|j         d         | j        j        z  | j        j                  }|| j        s||z   }|Mt          j        |j         d         |j	                  
                    d                              |
d          }|                     ||          }t          j        || j        j        | j                  }t!          | j                  D ]D\  }}|dk    s| j        j        r | j        |         d|||d|	\  }}||z   } ||f||||d|	}E|                     |          }|S )	Nr   r   r+   r*   r"  r%  r$  r   )r2   r  r   r   rb   r   r:  r.   r1   r)   rd   re   r  r&  r   r   r   r   r  r  r<  )r   r  r  r  r   r  r   r   r   r   rD   r   r   r   r)  r*  r   logitss                     r8   r   zBltLocalDecoder.forward  s    #(+
%66|DD#++*1-0HH$+Ja
 
 #D,C#)L8M]03M<PQQQ[[\]^^eefprtuu  #oom\JJ	-4;3FQUQ^___!$+.. 	 	HAuAvv:v,ED,B1,E -"/+7#9- - 	- -)& !.0F F!E$7- /-   MM =))r:   NNNNNNNN)r   r   r   r!   r   r   r   r   r.   r   r   r   r   r   r   r   r   s   @r8   r7  r7     s)        !!!!4      0  1504/31537+/599=0 0E,-0  -0 u|,	0
 !.0 u/00 "%0 !!120 !) 60 +,0 0 0 0 0 0 0 0r:   r7  c                        e Zd ZU eed<   d eedd          iZdef fdZ	 	 	 	 dde	j
        d	ee	j
                 d
ee	j                 dee         dee	j                 dee         fdZ xZS )BltGlobalTransformerr   global_attentionsr   global_transformerr   c                    t                                          |           || _        t          j                    | _        t          |j                  D ]*}| j                            t          ||                     +t          |          | _        t          |dd           't          j        |j        |j        d          | _        nt          j                    | _        |                                  d S )Nr  encoder_cross_output_sizeFr  )r   r   r   r  r  r   rT   r  r   r   r   r  getattrr  rE  r   token_embedding_projectionIdentityr  r   s      r8   r   zBltGlobalTransformer.__init__U  s       moov788 	G 	GIK269EEFFFF,F;;; 66==I.0i0&2D5/ / /D++ /1kmmD+r:   Ninput_embedsr   r  r   r   r   c           	         |j         \  }}}	|                     |          }
t          j        |
| j        j        | j                  }
|Mt          j        |j         d         |j                  	                    d          
                    |d          }|                     |
|          }t          | j                  D ]\  }} ||
f||||d|}
|
S )Nr"  r   r+   r   r*   r$  )r2   rH  r&  r   r   r   r.   r1   r)   rd   re   r  r   r   )r   rJ  r   r  r   r   r   rD   rE   r   r   r   r   r)  s                 r8   r   zBltGlobalTransformer.forwardg  s     ".!3
GQ77EE	-4;3FQUQ^___\/2<;NOOOYYZ[\\ccdnprss  #oom\JJ!$+.. 	 	HAu!E$7- /-   MM r:   r   )r   r   r   r    r   r   r   r   r   r.   r   r   r   r   r   r   r   r   r   s   @r8   rA  rA  O  s         &&&&^^,<ARfggg9      * 2637+/59 l !. u/0	
 "% !!12 +,       r:   rA  c                   L    e Zd ZU eed<   def fdZ	 	 	 	 	 	 	 	 	 	 ddeej                 deej	                 deej                 dee
         deej                 d	ee         d
eej                 dee         dee         dee         dee         fdZe	 	 dd            Z xZS )
BltPatcherr   c                 P   t                                          |           t          | j                  | _        t          j                    | _        t          | j        j	                  D ]/}| j        
                    t          | j        |                     0t          j        | j        j        | j        j                  | _        t!          | j        j        | j        j                  | _        t          j        | j        j        | j        j        d          | _        d S )Nr  r   FrF  )r   r   r   r   r  r  r  r   rT   r  r   r   r  r  r   rS   r   r   r<  r  lm_headr   s      r8   r   zBltPatcher.__init__  s       ,DK@@@moot{<== 	L 	LIK24;	JJKKKKL)?AXYYt{6DK<TUUU	yK#K"
 
 
r:   Nr  r   r  r   r  r   r   
patch_size	thresholdrw   r   c                 N   |d u |d uz  rt          d          ||                     |          }|r|t                      }|B||                                nd}t	          j        |||j        d         z   |j                  }||                    d          }t          | j
        |||||          }|}|                     ||          }| j        D ]} ||||          }|                     |                     |                    }t          j                            |                                          }|j        d d         \  }}||                     ||||		          }n#t	          j        ||f|j        |j        
          }t+          ||
          }|||fS )N:You must specify exactly one of input_ids or inputs_embedsr   r   r+   r   rJ  r   r   r   r  )r   r   )r>  r   )	entropiesr`   rP  rQ  r'   )rg   rS   r   get_seq_lengthr.   r1   r2   r)   rd   r	   r   r  r   rO  r<  distributionsCategoricalentropypatch_lengths_from_entropiesonesr(   r   )r   r  r   r  r   r  r   r   rP  rQ  rw   r   past_seen_tokenscausal_maskr   r   r)  r>  prediction_entropiesrD   r`   rv   s                         r8   r   zBltPatcher.forward  s    -t";< 	[YZZZ  --i88M 	-0*nnO!CRC^==???de"\ "2]5H5K"KTaTh  N )33A66L(;&))+%
 
 
 &"oom\JJ[ 	v 	vE!E-EXituuuMMdii6677$2>>f>MMUUWW&3&9"1"&=#
O! ==. /%#	 >  MM "J_-]5HQ^Qe  M .m=MNN#]F::r:   c                    | j         d         }t          j        ddgt          j        | j                                      d                              |d          }|j         d         }| ddddf         } | |k    }|j         d         }t          j        || j                                      d                              |d          }	t          j	        |	|          }
t          j
        |	|
gd          }t          j
        || gd          }||                             ||          }|                    d                                          }|ddd|f         }t          j
        |||z   fd          }t          j	        |ddddf         |dz
            }t          j
        |ddddf         dz
  |fd          }||z
  dz   }|S )z
        Computes patch lengths from token entropies.

        Depending on whether a threshold is provided, the function uses either:
        - Thresholding the entropy values (when `threshold` is set).
        r   r   r'   Nr+   r*   r,   )r2   r.   r/   longr)   rd   repeatr1   re   	full_likerB   r   r3   r   )rU  r`   rP  rQ  rD   init_tokensoffset
patch_maskrE   token_indicessentinelpadded_indicespadded_maskpatch_startsmax_valid_patchespatch_start_ids
last_token
patch_endsrv   s                      r8   rZ  z'BltPatcher.patch_lengths_from_entropies  s    _Q'
 L!Quz):JKKKUUVWXX__`jlmnn 	 "1% aaae$	 *
"1% WY5EFFFPPQRSSZZ[egijj?='::M8#<!DDD ij[ 9qAAA &k2:::wOO&NNqN115577#AAA'9(9'9$9:  )[,2G$HaPPP __QQQU%;_q=PQQ
Y122 6 :JGQOOO
"_4q8r:   r5  )NN)r   r   r   r#   r   r   r   r.   r   r   r   FloatTensorrj   r   floatr   r   r   staticmethodrZ  r   r   s   @r8   rM  rM    sb        
/ 
 
 
 
 
 
  151537+/59$(59$(%)*.?; ?;E,-?; !.?; u/0	?;
 "%?;   12?; D>?; !!12?; SM?; E??; #3-?; +,?; ?; ?; ?;B  	3 3 3 \3 3 3 3 3r:   rM  c                   R    e Zd Zdef fdZe	 	 	 	 	 	 	 	 ddeej                 deej	                 deej	                 deej                 dee
         d	eej                 d
ee         deej                 dee         defd            Zd Zd Zdej	        dedej	        fdZ xZS )BltModelr   c                    t                                          |           d| _        || _        t	          |j                  | _        t          |j                  | _	        t          |j                  | _        |j        t          |j                  z  }|j        |z  }t#          j        ||j        j                  | _        | j        j        rVt-          |j                  | _        | j                                         | j                                        D ]	}d|_        
nd | _        |                                  d S )NF)r   r   r  r   r   encoder_configrV   rA  global_configrC  r7  decoder_configr   rN   rU   rO   rP   r  r  r   rM   patch_in_forwardrM  patcher_configpatchereval
parametersrequires_gradr  )r   r   num_embeddingstotal_vocab_sizeparamr   s        r8   r   zBltModel.__init__  s#      &+#,V-BCC"6v7K"L"L,V-BCCDs6KnGoGoo!?.P*,,7GI^Ij*k*k';' 	 %f&;<<DLL0022 , ,&+##,  DLr:   Nr  rv   r   r  r   r  r   r   r   rQ   c	                 0   |d u |d uz  rt          d          ||}
|j        \  }}}nF|j        \  }}t          || j        | j        | j        j        | j        j        | j        j                  }
|| j        j	        dk    re| j
        ^|t          d          | 
                    || j        j        | j        j        | j        j        | j        j        |j                  \  }}}nT||j        n|j        }||j        n|j        }t#          t%          j        ||dz   f||          | j        j                  }|                     ||          }|B||                                nd}t%          j        |||
j        d         z   |
j                  }||                    d          }t1          | j        |
||||	          }t3          ||j        d         |d
| j        j        |
j                  } | j        d||
||||j        d         |d|	\  }}|                    ||j        d         d          }t%          j        d|j        d         |j                  }|                    d          }t1          | j        |d |d d 	          } | j        d|||d|	}|                     |d d dd f         |          }t3          ||j        d         |d| j        j        |
j                  } | j        d||||||||d|	}t=          ||          S )NrS  rY  z0input_ids is required for entropy-based patching)rP  rQ  rw   patching_batch_sizer)   r   r'   r   r+   rT  T)r^   r_   r`   ra   rb   r(   )r  r  r   r  r   r_   r^   r*   )rJ  r   r  F)r  r  r  r   r  r   r   r   )last_hidden_stater   r   )rg   r2   r]   rV   rM   r   rN   rO   rP   patching_moderz  rP  patching_thresholdrw   r  r)   r(   r   r.   r[  _patch_ids_from_lengthsrV  r1   rd   r	   ru   rb   r   rC  r   r
   )r   r  rv   r   r  r   r  r   r   r   encoder_embedsrD   r`   r   r)   r(   r^   r\  r]  cross_attn_mask_encencoder_hidden_statesr+  global_cache_positionglobal_position_idsglobal_causal_maskglobal_hidden_statesdecoder_patch_idscross_attn_mask_decoutputs                                r8   r   zBltModel.forward"  s    -t";< 	[YZZZ $*N-:-@*J*3/'J4"/@89 N  {(I55$,:R$$%WXXX&*ll#{5"k<%)[%A(,(G$+ '3 ' '#=!! .7-B))H\+4+@	mFY 5J
Oa,?@V\]]]K0! ! 00PP	!CRC^==???de"\ "2^5I!5L"LUcUj  N )33A66L(;'))+%
 
 
 B%+A.+#1 &
 
 
 7Id6H 	7
(&%#6%+A.	7
 	7
 	7
 	7
33  488]EXYZE[]_`` %Q0D0J10MVjVq r r r3==a@@/;-0 
 
 
  7t6  
--, 
  
 	 
  
 !88qqq!""u9M__A'%+A.+$1 &
 
 
 $# 

/-&%+)#6

 

 

 

 '$+
 
 
 	
r:   c                     | j         j        S rz   rV   rS   )r   s    r8   get_input_embeddingszBltModel.get_input_embeddings  s    !..r:   c                     || j         _        d S rz   r  )r   values     r8   set_input_embeddingszBltModel.set_input_embeddings  s    */'''r:   rE   c                    |j         d         }t          j        t          j        |d|j        |j                  |                    d          d d d df         gd          }t          j        ||j                  }|                    d          |                    d                              d          k    	                    d          dz
  S )Nr   r   r'   r*   r,   r+   )
r2   r.   rB   rA   r(   r)   cumsumr1   rd   r3   )r   rv   rE   rD   rj  token_positionss         r8   r  z BltModel._patch_ids_from_lengths  s    "(+
yJ1D]Mabbb$$$,,QQQV4 
 
 
  ,w}7KLLL&&q))_-F-Fq-I-I-S-STV-W-WW\\ac\ddghhhr:   r?  )r   r   r   r   r   r   r   r.   r   r   r   ro  rj   r   r   r
   r   r  r  r   r  r   r   s   @r8   rs  rs    s       y      (  15041537+/59$(59~
 ~
E,-~
  -~
 !.	~

 u/0~
 "%~
   12~
 D>~
 !!12~
 +,~
 
!~
 ~
 ~
 ~
@/ / /0 0 0
iU\ 
iC 
iTYT` 
i 
i 
i 
i 
i 
i 
i 
ir:   rs  c                       e Zd ZU eed<   dZdZdgZdef fdZ	 	 	 	 	 	 	 	 	 	 	 	 dde	e
j                 d	e	e
j                 d
e	e
j                 de	e
j                 de	e
j                 de	ee
j        e
j        f                  de	eeee
j                 f                  de	e
j                 de	e
j                 de	e         de	e
j                 deee
j        f         dee         deeef         fdZ xZS )BltForCausalLMr   Fmodelzlm_head.weightc                    t                                          |           |j        | _        t          |          | _        t          j        |j        j        |j        d          | _	        | 
                                 d S )NFrF  )r   r   r  rs  r  r  r  rw  r   rO  r  )r   r   r   s     r8   r   zBltForCausalLM.__init__  sl        +f%%
y!6!BFDU\abbbr:   Nr   r  r   r  r   rq   full_text_row_masked_out_maskr   r  labelsr   r   logits_to_keepr   rQ   c                 n    | j         d||||||||
|d	|}|j        }t          |t                    rt	          | d           n|}|                     |d d |d d f                                                   }d }|	 | j        ||	| j        fi |}t          |||j
        |j        |j                  S )N)	r  r   r  rq   r  r   r  r   r   )lossr>  r   r   r   r   )r  r  
isinstancer   slicerO  rp  loss_functionr  r   r   r   r   )r   r  r   r  r   rq   r  r   r  r  r   r   r  r   outputsr   slice_indicesr>  r  s                      r8   r   zBltForCausalLM.forward  s   " $* 
)%!5*G+')
 
 
 
  18B>SV8W8Wk~ot444]kmAAA}aaa,?@AAGGII%4%ffdoPPPPD%#3!/)
 
 
 	
r:   )NNNNNNNNNNNr   )r   r   r   r   r   _can_compile_fullgraphbase_model_prefix_tied_weights_keysr   r   r.   r   r   tupler   r   listro  rj   r   r   r   r   r   r   r   s   @r8   r  r    s        "*+y       151537=A;?UYKO59-1$(5934,
 ,
E,-,
 !.,
 u/0	,

 !))9 :,
 'u'78,
 (0elEL6P0Q'R,
 "%tE4E/F(F"GH,
   12,
 )*,
 D>,
 !!12,
 c5</0,
 +,,
 
u,,	-,
 ,
 ,
 ,
 ,
 ,
 ,
 ,
r:   r  )r   rs  rM  r  )r$   )r   r$   r;   )Or   typingr   r   r   r.   torch.distributionstorch.nnr  torch.nn.functional
functionalr&  cache_utilsr   r   masking_utilsr	   modeling_outputsr
   r   modeling_utilsr   processing_utilsr   utilsr   r   r   utils.genericr   r   cohere2.modeling_cohere2r   r   mllama.modeling_mllamar   r   r   r   r   r   r   r   configuration_bltr   r    r!   r"   r#   
get_loggerr   loggerr   r9   r   rK   r  r  r]   float32rj   r(   r  ru   r   r   r   r   r   r   r   r   r   r7  rA  rM  rs  r  __all__r   r:   r8   <module>r     s   C B , , , , , , , , , ,                     . . . . . . . . / / / / / / O O O O O O O O 5 5 5 5 5 5 & & & & & & @ @ @ @ @ @ @ @ @ @ ? ? ? ? ? ? ? ?       	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	              
	H	%	%: : : : : :< \a |),9<UX   $#,# !## +.	#
 #'# $'# \# # # #T  %K  K |K K  K  	K 
 K  ;K  5<%&K  K  K  K \) )RU )[`[g ) ) ) )X	 	 	 	 	] 	 	 		 	 	 	 	" 	 	 		 	 	 	 	/ 	 	 	` ` ` ` `9 ` ` `
 
 
 
 
. 
 
 
4<) <) <) <) <)0 <) <) <)~ 7 7 7 7 7. 7 7 7*p" p" p" p" p"( p" p" p"fL L L L L( L L L^2 2 2 2 2- 2 2 2jF F F F F# F F FRfi fi fi fi fi! fi fi fiR9
 9
 9
 9
 9
& 9
 9
 9
x  r:   