
     `iY'                       d dl Z d dlmZ d dlmZmZmZmZ d dlZd dl	m
Z
 d dlm
c mZ d dl	mZ ddlmZ ddlmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZm Z  ddl!m"Z"m#Z# ddl$m%Z%m&Z& ddl'm(Z( ddl)m*Z*m+Z+m,Z,m-Z- ddl.m/Z/ ddl0m1Z1m2Z2m3Z3  ed           G d de
j4                              Z5 G d de
j4                  Z6 G d de
j4                  Z7 G d de
j4                  Z8 G d de
j4                  Z9 G d  d!e
j4                  Z:d" Z;d#ej<        d$ej<        d%ej<        d&ej<        d'e=ej<        ej<        f         f
d(Z>d)ej<        d*e?d'ej<        fd+Z@	 dVd-e
j4        d.ej<        d/ej<        d0ej<        d1eej<                 d2eAd3eAd4e(e*         fd5ZB G d6 d7e
j4                  ZC G d8 d9e          ZD G d: d;e
j4                  ZEd< ZFdWd=ZG G d> d?e
j4                  ZH G d@ dAe
j4                  ZI G dB dCe          ZJe e+dDE           G dF dGe                                   ZKe+ G dH dIe&                      ZL G dJ dKeL          ZMe+ G dL dMeL                      ZNe+ G dN dOeL                      ZOe e+dPE           G dQ dRe                                   ZP G dS dTeLe          ZQg dUZRdS )X    N)	dataclass)AnyCallableOptionalUnion)	LayerNorm   )ACT2FN)CacheDynamicCache)GenerationMixin)use_kernel_forward_from_hub)create_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputWithPastModelOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleis_torchdynamo_compiling)check_model_inputs   )Glm4vConfigGlm4vTextConfigGlm4vVisionConfigRMSNormc                   ,     e Zd Zd fd	Zd Zd Z xZS )Glm4vRMSNormư>c                     t                                                       t          j        t	          j        |                    | _        || _        dS )z;
        Glm4vRMSNorm is equivalent to T5LayerNorm
        N)super__init__nn	Parametertorchonesweightvariance_epsilon)selfhidden_sizeeps	__class__s      |/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/glm4v/modeling_glm4v.pyr(   zGlm4vRMSNorm.__init__0   sD     	l5:k#:#:;; #    c                    |j         }|                    t          j                  }|                    d                              dd          }|t          j        || j        z             z  }| j        |                    |          z  S )N   Tkeepdim)	dtypetor+   float32powmeanrsqrtr.   r-   )r/   hidden_statesinput_dtypevariances       r3   forwardzGlm4vRMSNorm.forward8   s|    #)%((77 $$Q'',,R,>>%Ht?T4T(U(UU{]--k::::r4   c                 H    t          | j        j                   d| j         S )Nz, eps=)tupler-   shaper.   r/   s    r3   
extra_reprzGlm4vRMSNorm.extra_repr?   s&    )**II$2GIIIr4   )r%   )__name__
__module____qualname__r(   rC   rH   __classcell__r2   s   @r3   r$   r$   .   sb        $ $ $ $ $ $; ; ;J J J J J J Jr4   r$   c                   ,     e Zd Zddef fdZd Z xZS )Glm4VisionMlpFbiasc                    t                                                       |j        | _        |j        | _        t          j        | j        | j        |          | _        t          j        | j        | j        |          | _        t          j        | j        | j        |          | _	        t          |j                 | _        d S NrP   )r'   r(   r0   out_hidden_sizeintermediate_sizer)   Linear	gate_projup_proj	down_projr
   
hidden_actact_fn)r/   configrP   r2   s      r3   r(   zGlm4VisionMlp.__init__D   s    !-!'!74#3T5KRVWWWy!143IPTUUU4#94;KRVWWWV./r4   c                     |                      |                     |                     |                    |                     |          z            S N)rY   r[   rW   rX   r/   hidden_states     r3   rC   zGlm4VisionMlp.forwardM   s>    ~~dkk$..*F*FGG$,,WcJdJddeeer4   F)rI   rJ   rK   boolr(   rC   rL   rM   s   @r3   rO   rO   C   s_        0 0T 0 0 0 0 0 0f f f f f f fr4   rO   c                   L     e Zd Zdeddf fdZdej        dej        fdZ xZS )Glm4vVisionPatchEmbedr\   returnNc                    t                                                       |j        | _        |j        | _        |j        | _        |j        | _        | j        | j        | j        g}t          j        | j        | j        ||          | _	        d S )N)kernel_sizestride)
r'   r(   
patch_sizetemporal_patch_sizein_channelsr0   	embed_dimr)   Conv3dproj)r/   r\   rg   r2   s      r3   r(   zGlm4vVisionPatchEmbed.__init__R   sz     +#)#= !-+/$/RId.K`klll			r4   r@   c                    | j         j        j        }|                    d| j        | j        | j        | j                  }|                      |                    |                                        d| j                  }|S )Nr7   r:   )	rn   r-   r:   viewrk   rj   ri   r;   rl   )r/   r@   target_dtypes      r3   rC   zGlm4vVisionPatchEmbed.forward\   sw    y'-%** $":DOT_
 
 		-"2"2"2"F"FGGLLRQUQ_``r4   	rI   rJ   rK   r!   r(   r+   TensorrC   rL   rM   s   @r3   rd   rd   Q   sz        m0 mT m m m m m mU\ el        r4   rd   c                   ^     e Zd ZU ej        ed<   d
dededdf fdZdedej        fd	Z	 xZ
S )Glm4vVisionRotaryEmbeddinginv_freq     @dimthetare   Nc                     t                                                       d|t          j        d|dt          j                  |z  z  z  }|                     d|d           d S )N      ?r   r6   rp   rw   F
persistent)r'   r(   r+   arangefloatregister_buffer)r/   ry   rz   rw   r2   s       r3   r(   z#Glm4vVisionRotaryEmbedding.__init__h   sd    %ELC%+$N$N$NQT$TUVZeDDDDDr4   seqlenc                     t          j        || j        j        | j        j                  }t          j        || j                  }|S )Ndevicer:   )r+   r   rw   r   r:   outer)r/   r   seqfreqss       r3   rC   z"Glm4vVisionRotaryEmbedding.forwardm   s:    l6$-*>dmFYZZZC//r4   )rx   )rI   rJ   rK   r+   rt   __annotations__intr   r(   rC   rL   rM   s   @r3   rv   rv   e   s         lE EC E ED E E E E E E
c el        r4   rv   c                   Z     e Zd Zddededededdf
 fdZd	ej        dej        fd
Z	 xZ
S )Glm4vVisionPatchMergerFry   context_dimrZ   rP   re   Nc                    t                                                       t          j        |||          | _        t          |          | _        t          j        |||          | _        t          j        |||          | _        t          j        |||          | _	        t          j
                    | _        t          |         | _        d S rR   )r'   r(   r)   rV   rn   r   post_projection_normrW   rX   rY   GELUact1r
   r[   )r/   ry   r   rZ   rP   r2   s        r3   r(   zGlm4vVisionPatchMerger.__init__t   s    Ic3T222	$-cNN!3$???yk===;$???GII	Z(r4   r`   c                    |                      |          }|                     |                     |                    }|                     |                     |                     |                    |                     |          z            S r^   )rn   r   r   rY   r[   rW   rX   r_   s     r3   rC   zGlm4vVisionPatchMerger.forward~   sn    yy..yy!:!:<!H!HII~~dkk$..*F*FGG$,,WcJdJddeeer4   ra   )rI   rJ   rK   r   strrb   r(   r+   rt   rC   rL   rM   s   @r3   r   r   s   s        ) )C )c )s )$ )[_ ) ) ) ) ) )fEL fU\ f f f f f f f fr4   r   c                   :     e Zd Zdef fdZdej        fdZ xZS )Glm4vVisionEmbeddingsr\   c                    t                                                       || _        |j        | _        |j        | _        |j        | _        | j        | j        z  dz  | _        | j        | _        t          j
        | j        | j                  | _        |                     dt          j        | j                                      d          d           d S )Nr6   position_ids)r   r7   Fr}   )r'   r(   r\   r0   rl   
image_sizeri   num_patchesnum_positionsr)   	Embeddingposition_embeddingr   r+   r   expandr/   r\   r2   s     r3   r(   zGlm4vVisionEmbeddings.__init__   s    + + + Ot>1D!-"$,t/A4>"R"R^U\$:L-M-M-T-TU\-]-]jopppppr4   re   c                    | j         j        }|j        d         }|j        d         }|j        }	|                    |	          |                    |	          }}|dk    rt          j        d||	|j                  }
nt          t                    r!t          j
        |	t
          j                  t          t
          j                  s!t          j
        |	t
          j                  |j        d         }t          |dz            }|                    |||                              ddd                              d                              |	t
          j                  }t          j        fdt'          t)                              D                                           |	t
          j                  }t          j        fdt'          t)                              D                                           |	t
          j                  }|                    |	t
          j                  }|                    |	t
          j                  }|dz   |z  dz  dz
  }|dz   |z  dz  dz
  }t          j        ||fd	                              d                              d          }t-          j        ||d
dd          }|                    d                              d                              dd          }|                    |j                                      |j                  }
||
z   }|S )a  
        Forward pass with integrated position encoding adaptation using 2D interpolation.

        Args:
            embeddings: Input embeddings tensor
            lengths (torch.Tensor): Sequence lengths for each image in the batch.
            image_shapes (torch.Tensor): Tensor of shape [batch_size, 3] representing the image shapes (t, h, w).
            h_coords (torch.Tensor): Tensor of shape [total_seq] representing the h coordinate for each patch.
            w_coords (torch.Tensor): Tensor of shape [total_seq] representing the w coordinate for each patch.

        Returns:
            torch.Tensor: Embeddings with adapted position encoding added.
        r   r   r   g      ?r6   c                 V    g | ]%}|d f                              |                   &S r   repeat.0iimage_shapeslengthss     r3   
<listcomp>z1Glm4vVisionEmbeddings.forward.<locals>.<listcomp>   4    !e!e!eA,q!t"4";";GAJ"G"G!e!e!er4   c                 V    g | ]%}|d f                              |                   &S )r6   r   r   s     r3   r   z1Glm4vVisionEmbeddings.forward.<locals>.<listcomp>   r   r4   r7   ry   bicubicFborder)modealign_cornerspadding_mode)r   r-   rF   r   r;   r+   emptyr:   
isinstancelisttensorlongrt   r   rq   permute	unsqueezer<   catrangelenstackFgrid_samplesqueeze)r/   
embeddingsr   r   h_coordsw_coordspos_embed_weightr0   	total_seqr   adapted_pos_embedorig_size_sq	orig_sizepos_embed_2dtarget_htarget_wnorm_wnorm_hgridinterpolated_embed_fp32adapted_pos_embed_fp32s     ``                 r3   rC   zGlm4vVisionEmbeddings.forward   s(     29&,Q/N1%	!( &[[00(++f2E2E( >> %A{6QaQg h h h '4(( Q,wvUZPPPlEL99 [$|LuzZZZ ,1!4LL#-..I %%iKHHAq!!1677	  y!e!e!e!e!eQVWZ[bWcWcQdQd!e!e!effiiU] j  H y!e!e!e!e!eQVWZ[bWcWcQdQd!e!e!effiiU] j  H
  {{&{FFH{{&{FFH#~1Q6:F#~1Q6:F ;/R888BB1EEOOPQRRD '(md%V^' ' '#
 &=%D%DQ%G%G%O%OPR%S%S%[%[\]_`%a%a" 6 9 9:J:P Q Q T TU_Uf g g  "33
r4   rs   rM   s   @r3   r   r      st        
q0 
q 
q 
q 
q 
q 
qGPUP\ G G G G G G G Gr4   r   c                     | dd| j         d         dz  f         }| d| j         d         dz  df         }t          j        | |fd          S )*Rotates half the hidden dims of the input..Nr7   r6   r   )rF   r+   r   xx1x2s      r3   rotate_halfr      s]    	
3"!'"+"""	#B	
3q """	#B9rc2YB''''r4   qkcossinre   c                    | j         }|j         }|                                 |                                }} |                    d                                          |                    d                                          }}| |z  t          |           |z  z   }||z  t          |          |z  z   }|                    |          }|                    |          }||fS )N)r:   r   r   r   r;   )r   r   r   r   orig_q_dtypeorig_k_dtypeq_embedk_embeds           r3   apply_rotary_pos_emb_visionr      s     7L7L7799aggiiqA}}R  &&((#--*;*;*A*A*C*CC3w;q>>C/0G3w;q>>C/0Gjj&&Gjj&&GGr4   r@   n_repc                     | j         \  }}}}|dk    r| S | dddddddddf                             |||||          } |                     |||z  ||          S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)rF   r   reshape)r@   r   batchnum_key_value_headsslenhead_dims         r3   	repeat_kvr      s    
 2?1D.Ehzz!!!!QQQaaa"23::5BUW\^bdlmmM  (;e(CT8TTTr4           modulequerykeyvalueattention_maskscalingdropoutkwargsc                 R   t          || j                  }t          || j                  }	t          j        ||                    dd                    |z  }
|$|d d d d d d d |j        d         f         }|
|z   }
t          j                            |
dt          j	                  
                    |j                  }
t          j                            |
|| j                  }
t          j        |
|	          }|                    dd                                          }||
fS )Nr6   r	   r   r7   ry   r:   )ptrainingr   )r   num_key_value_groupsr+   matmul	transposerF   r)   
functionalsoftmaxr<   r;   r:   r   r   
contiguous)r   r   r   r   r   r   r   r   
key_statesvalue_statesattn_weightscausal_maskattn_outputs                r3   eager_attention_forwardr     s    3 ;<<JUF$?@@L<z';';Aq'A'ABBWLL!$QQQ111.D
0@0D.D%DE#k1=((2U](SSVVW\WbccL=((6?([[L,|\::K''1--88::K$$r4   c                        e Zd Zdeddf fdZ	 	 d
dej        dej        deej                 deeej        ej        f                  dej        f
d	Z	 xZ
S )Glm4vVisionAttentionr\   re   Nc                    t                                                       |j        | _        |j        | _        | j        | j        z  | _        d| _        t          j        |j        |j        dz  |j	                  | _
        t          j        |j        |j        d          | _        | j        dz  | _        || _        |j        | _        d| _        d S )Nr   r	   rS   F      )r'   r(   r0   ry   	num_headsr   r   r)   rV   attention_biasqkvrn   r   r\   attention_dropout	is_causalr   s     r3   r(   zGlm4vVisionAttention.__init__  s    %)DN2$%!9V/1Ca1GfNcdddIf0&2D5QQQ	}d*!'!9r4   r@   
cu_seqlensrotary_pos_embposition_embeddingsc                 N    |j         d         }                     |                              |d j        d                              dddd                              d          \  }}}	|\  }
}t          |||
|          \  }}|                    dd                              d          }|                    dd                              d          }|	                    dd                              d          }	t           j
        j        dk    rt           j
        j                  j
        j        dk    rS|dd          |d d         z
                                  }  |||	fd  j         j        sdn j        ||||d	d
\  }}nS|dd          |d d         z
  fd|||	fD             } fdt#          | D             }t%          j        |d          }|                    |d                                          }                     |          }|S )Nr   r	   r7   r   r6   eagerflash_attention_2r   F)r   r   r   cu_seq_lens_qcu_seq_lens_kmax_length_qmax_length_kr  c                 b    g | ]+}t          j        |                                d           ,S )r6   r   )r+   splittolist)r   r   r   s     r3   r   z0Glm4vVisionAttention.forward.<locals>.<listcomp>P  sA       AGFGNN$4$4!<<<  r4   c           
      l    g | ]0\  }}} |||fd j         j        sdnj        ddd         1S )Nr   F)r   r   r   r  r   )r   r   r
  )r   r   r   vattention_interfacer   r/   s       r3   r   z0Glm4vVisionAttention.forward.<locals>.<listcomp>T  s        Aq! $#	

 $( L'+}PCC$:P#
 
 
 
 
  r4   r   )rF   r	  r   r  r   unbindr   r   r   r  r\   _attn_implementationr   maxr   r   r
  zipr+   r   r   rn   )r/   r@   r  r  r  r   
seq_lengthquery_statesr   r   r   r   
max_seqlenr  _splitsattn_outputsr  r   s   `    `           @@r3   rC   zGlm4vVisionAttention.forward$  s    #(+
HH]##++J4>2NNVVWXZ[]^`abbiijkll 	/j, 'S#>|ZY\^a#b#b j#--a33==a@@))!Q//99!<<
#--a33==a@@(?;+w66"9$+:Z"[;+/BBB$QRR.:crc?:??AAJ00	
  $#'=Ld6L((''   NK" !nz#2#6G   LXZdfrKs  F       #F|  L  )La888K!))*b99DDFFii,,r4   NN)rI   rJ   rK   r!   r(   r+   rt   r   rE   rC   rL   rM   s   @r3   r  r    s        0 T      " 26KOB B|B LB !.	B
 &eEL%,,F&GHB 
B B B B B B B Br4   r  c                        e Zd Zd	 fdZ	 	 d
dej        dej        deej                 deeej        ej        f                  dej        f
dZ xZ	S )Glm4vVisionBlockre   Nc                    t                                                       t          |j        |j                  | _        t          |j        |j                  | _        t          |          | _        t          |d          | _
        d S )Nr1   FrS   )r'   r(   r$   r0   rms_norm_epsnorm1norm2r  attnrO   mlpr   s     r3   r(   zGlm4vVisionBlock.__init__j  st    !&"4&:MNNN
!&"4&:MNNN
(00	 e444r4   r@   r  r  r  c                     | | j         |                     |          f|||d|z   }||                     |                     |                    z   }|S )N)r  r  r  )r.  r,  r/  r-  )r/   r@   r  r  r  r   s         r3   rC   zGlm4vVisionBlock.forwardq  st     &		JJ}%%)
!) 3	)
 )

 )
 )
 
 &M1J1J(K(KKr4   re   Nr&  )
rI   rJ   rK   r(   r+   rt   r   rE   rC   rL   rM   s   @r3   r(  r(  i  s        5 5 5 5 5 5 26KO | L !.	
 &eEL%,,F&GH 
       r4   r(  c                   |     e Zd ZU ej        ed<   ddef fdZ ej                    e	d                         Z
 xZS )Glm4vTextRotaryEmbeddingrw   Nr\   c                    t                                                       t          |d          r@|j        9|j                            d|j                            d                    | _        nd| _        |j        | _        |j        | _        || _	        t          | j                 | _        |                     | j	        |          \  }| _        |                     d|d           | j        | _        d S )Nrope_scaling	rope_typetypedefaultrw   Fr}   )r'   r(   hasattrr5  getr6  max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr\   r   rope_init_fnattention_scalingr   rw   original_inv_freq)r/   r\   r   rw   r2   s       r3   r(   z!Glm4vTextRotaryEmbedding.__init__  s    6>** 	'v/B/N#044[&BUBYBYZ`BaBabbDNN&DN"("@$*$B!/?+/+<+<T[&+Q+Q($(ZeDDD!%r4   c                 2   | j         d d d d d f                                                             d|j        d         dd          }|d d d d d d d f                                         }t	          |j        j        t                    r|j        j        dk    r|j        j        nd}t          j	        |d          5  |                                |                                z  
                    dd          }t          j        ||fd	          }|                                | j        z  }|                                | j        z  }	d d d            n# 1 swxY w Y   |                    |j        
          |	                    |j        
          fS )Nr	   r   r7   mpscpuF)device_typeenabledr6   r   rp   )rw   r   r   rF   r   r   r7  r   r+   autocastr   r   r   r?  r   r;   r:   )
r/   r   r   inv_freq_expandedposition_ids_expandedrD  r   embr   r   s
             r3   rC   z Glm4vTextRotaryEmbedding.forward  s   
 !M$aaa*=>DDFFMMaQ]QcdeQfhjlmnn ,QQQ4] ; A A C C'1!(-'E'Ek!(-[`J`J`ahmmfk^UCCC 	5 	5&,,..1F1L1L1N1NNYYZ[]^__E)UEN333C''))d44C''))d44C		5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 vvAGv$$cff17f&;&;;;s   >BEEEr^   )rI   rJ   rK   r+   rt   r   r    r(   no_gradr   rC   rL   rM   s   @r3   r3  r3    s         l/ / / / / / / /" U]__< <  _< < < < <r4   r3  c                     | ddddf         }| ddddf         }t          j        | |fd                              d          S )	r   .r   Nr6   r   r7   r   r   )r+   r   flattenr   s      r3   rotate_half_llmrM    sQ    	
319B	
319B;Ryb)))11"555r4   c           	      ^   |dz  }t          j        d t          |                    |d                    D             d                              |          }t          j        d t          |                    |d                    D             d                              |          }|dd|j        d         dz  f                             dd          }|dd|j        d         dz  f                             dd          }|j        d         }| dd|f         | d|df         }}|dd|f         |d|df         }
}	||z  t          |          |z  z   }|	|z  t          |	          |z  z   }t          j        ||gd          }t          j        ||
gd          }||fS )aX  Applies Rotary Position Embedding with Multimodal Sections to the query and key tensors (https://qwenlm.github.io/blog/qwen2-vl/).

    Explanation:
        Multimodal 3D rotary position embedding is an extension to 1D rotary position embedding. The input embedding
        sequence contains vision (images / videos) embedding and text embedding or just contains text embedding. For
        vision embedding part, we apply rotary position embedding on temporal, height and width dimension separately.
        Here we split the channel dimension to 3 chunks for the temporal, height and width rotary position embedding.
        For text embedding part, we just apply 1D rotary position embedding. The three rotary position index (temporal,
        height and width) of text embedding is always the same, so the text embedding rotary position embedding has no
        difference with modern LLMs.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        mrope_section(`List(int)`):
            Multimodal rope section is for channel dimension of temporal, height and width in rope calculation.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    r6   c                 *    g | ]\  }}||d z           S r	    r   r   ms      r3   r   z3apply_multimodal_rotary_pos_emb.<locals>.<listcomp>  $    VVV$!QQq1uXVVVr4   r7   r   c                 *    g | ]\  }}||d z           S rP  rQ  rR  s      r3   r   z3apply_multimodal_rotary_pos_emb.<locals>.<listcomp>  rT  r4   .N)r+   r   	enumerater  r   rF   repeat_interleaverM  )r   r   r   r   mrope_sectionunsqueeze_dim
rotary_dimq_rotq_passk_rotk_passr   r   s                r3   apply_multimodal_rotary_pos_embr_    s   : "A%M
)VV)CIImQSI4T4T*U*UVVV\^
_
_
_
i
i C )VV)CIImQSI4T4T*U*UVVV\^
_
_
_
i
i C
 c'SYr]a'''
(
:
:1"
:
E
EC
c'SYr]a'''
(
:
:1"
:
E
EC 2Jc;J;&'3
+;)<6Ec;J;&'3
+;)<6E s{u55;<Gs{u55;<G i&)r222Gi&)r222GGr4   c                   H    e Zd ZdZddedee         f fdZ	 	 	 	 ddej	        de
ej	        ej	        f         deej	                 d	eej                 d
ee         deej                 dee         de
ej	        eej	                 ee
ej	                          f         fdZ xZS )Glm4vTextAttentionz
    Multi-headed attention from 'Attention Is All You Need' paper.
    and "Generating Long Sequences with Sparse Transformers".
    Nr\   	layer_idxc                    t                                                       || _        || _        |j        | _        |j        | _        | j        | j        z  | _        |j        | _        | j        | j        z  | _	        d| _
        |j        | _        |j        | _        | j        dz  | _        t          j        | j        | j        | j        z  d          | _        t          j        | j        | j        | j        z  d          | _        t          j        | j        | j        | j        z  d          | _        t          j        | j        | j        z  | j        d          | _        d S )NTr  rS   F)r'   r(   r\   rb  r0   num_attention_headsr  r   r   r   r  r
  r5  r   r)   rV   q_projk_projv_projo_projr/   r\   rb  r2   s      r3   r(   zGlm4vTextAttention.__init__  s/   "!-3(DN:#)#= $(Nd6N$N!!'!9"/}d*i 0$.4=2PW[\\\i 0$2JT]2Zaefffi 0$2JT]2Zaefffi >@PW\]]]r4   r@   r  r   r   past_key_valuescache_positionr   re   c                    |                                 \  }}	}
|                     |          }|                     |          }|                     |          }|                    ||	d| j                                      dd          }|                    ||	d| j                                      dd          }|                    ||	d| j                                      dd          }|\  }}t          ||||| j        d                   \  }}|&|||d}|	                    ||| j
        |          \  }}t          }| j        j        dk    rt          | j        j                 } || ||||f| j        sdn| j        | j        d|\  }}|                    ||	d                                          }|                     |          }||fS )	Nr7   r   r6   rX  )r   r   rk  r  r   )r   r   )sizere  rf  rg  rq   r   r   r_  r5  updaterb  r  r\   r  r   r   r
  r   r   r   rh  )r/   r@   r  r   r   rj  rk  r   bszq_lenr#  r!  r   r   r   r   cache_kwargsr  r  r   s                       r3   rC   zGlm4vTextAttention.forward  s    &**,,UA{{=11[[//
{{=11#((eRGGQQRSUVWW__S%T]CCMMaQRSS
#((eRGGQQRSUVWW&S#B*c30A/0R$
 $
 j &#&snUUL'6'='=j,X\Xfht'u'u$J(?;+w66"9$+:Z"[$7$7	%
  $}HCC$2HL	%
 	%
 	%
 	%
!\ "))#ub99DDFFkk+..L((r4   r^   NNNN)rI   rJ   rK   __doc__r    r   r   r(   r+   rt   rE   
LongTensorr   r   r   rC   rL   rM   s   @r3   ra  ra    s.        
^ ^ ^8C= ^ ^ ^ ^ ^ ^0 2637+/59.) .)|.) #5<#=>.) !.	.)
 u/0.) "%.) !!12.) -..) 
u|Xel3XeEL>Q5RR	S.) .) .) .) .) .) .) .)r4   ra  c                   B     e Zd Z fdZdej        dej        fdZ xZS )Glm4vTextMLPc                 "   t                                                       || _        t          j        |j        d|j        z  d          | _        t          j        |j        |j        d          | _        t          |j
                 | _        d S )Nr6   FrS   )r'   r(   r\   r)   rV   r0   rU   gate_up_projrY   r
   rZ   activation_fnr   s     r3   r(   zGlm4vTextMLP.__init__6  sz    If&8!f>V:V]bccc6#;V=OV[\\\#F$56r4   r@   re   c                     |                      |          }|                    dd          \  }}||                     |          z  }|                     |          S )Nr6   r7   r   )rx  chunkry  rY   )r/   r@   	up_statesgates       r3   rC   zGlm4vTextMLP.forward>  sX    %%m44	#//!/44i 2 24 8 88	~~i(((r4   )rI   rJ   rK   r(   r+   FloatTensorrC   rL   rM   s   @r3   rv  rv  5  s`        7 7 7 7 7)U%6 )5;L ) ) ) ) ) ) ) )r4   rv  c                   @    e Zd Zdedef fdZ	 	 	 	 	 	 ddej        deej        ej        f         de	ej                 d	e	ej
                 d
e	e         de	e         de	e         de	ej
                 deej        e	eej        ej        f                  f         fdZ xZS )Glm4vTextDecoderLayerr\   rb  c                    t                                                       |j        | _        t          ||          | _        t          |          | _        t          |j        |j                  | _	        t          |j        |j                  | _
        t          |j        |j                  | _        t          |j        |j                  | _        d S )Nr*  )r'   r(   r0   ra  	self_attnrv  r/  r$   r+  input_layernormpost_attention_layernormpost_self_attn_layernormpost_mlp_layernormri  s      r3   r(   zGlm4vTextDecoderLayer.__init__H  s    !-+FI>>''+F,>FDWXXX(4V5GVM`(a(a(a%(4V5GVM`(a(a(a%".v/AvGZ"["["[r4   NFr@   r  r   r   rj  output_attentions	use_cacherk  re   c	                 &   |}
|                      |          } | j        d||||||||d|	\  }}|                     |          }|
|z   }|}
|                     |          }|                     |          }|                     |          }|
|z   }|S )N)r@   r  r   r   rj  r  r  rk  rQ  )r  r  r  r  r/  r  )r/   r@   r  r   r   rj  r  r  rk  r   residualr#  s               r3   rC   zGlm4vTextDecoderLayer.forwardR  s     !,,];; *4> 

' 3)%+/)

 

 

 

q 55mDD =0 !55mDD////>> =0r4   )NNNFFN)rI   rJ   rK   r    r   r(   r+   rt   rE   r   rt  r   rb   r~  rC   rL   rM   s   @r3   r  r  G  s-       \ \3 \ \ \ \ \ \ 2637+/,1$)59' '|' #5<#=>' !.	'
 u/0' "%' $D>' D>' !!12' 
u (51BEDU1U+V"WW	X' ' ' ' ' ' ' 'r4   r  zJ
    Base class for Llava outputs, with hidden states and attentions.
    )custom_introc                       e Zd ZU dZdZeej                 ed<   dZ	ee
         ed<   dZeeej                          ed<   dZeeej                          ed<   dZeej                 ed<   dS )Glm4vModelOutputWithPasta[  
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
        The rope index difference between sequence length and multimodal rope.
    Nlast_hidden_staterj  r@   
attentionsrope_deltas)rI   rJ   rK   rs  r  r   r+   r~  r   rj  r   r@   rE   r  r  rt  rQ  r4   r3   r  r  |  s           6:x 12999'+OXe_+++8<M8E%"345<<<59Ju012999.2K%*+22222r4   r  c                   H    e Zd ZU eed<   dZdZddgZdZdZ	dZ
dZdZeedZdS )	Glm4vPreTrainedModelr\   modelTr  r(  rj  )r@   r  N)rI   rJ   rK   r   r   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_can_compile_fullgraph_supports_attention_backendr  ra  _can_record_outputsrQ  r4   r3   r  r    sf         &*#02DE"3N!"&.( r4   r  c                   j     e Zd ZU eed<   dgZd
 fdZd Zdej	        dej	        dej	        fd	Z
 xZS )Glm4vVisionModelr\   r(  re   Nc                    t                                                     j        | _        j        | _        t	                    | _        t                    | _        j        j	        z  }t          |dz            | _        t          j        fdt          j                  D                       | _        t#          j        j        j                  | _        t-          j        j                  | _        t          j        j        j        j        j                  | _        t-          j        j                  | _        d| _        |                                  d S )Nr6   c                 .    g | ]}t                    S rQ  )r(  )r   r#  r\   s     r3   r   z-Glm4vVisionModel.__init__.<locals>.<listcomp>  s"    $[$[$[!%5f%=%=$[$[$[r4   )ry   r   rZ   r*  )rk   out_channelsrg   rh   F)r'   r(   spatial_merge_sizeri   r   r   rd   patch_embedr0   r  rv   r  r)   
ModuleListr   depthblocksr   rT   rU   rZ   mergerr$   r+  post_conv_layernormConv2d
downsamplepost_layernormgradient_checkpointing	post_init)r/   r\   r   r2   s    ` r3   r(   zGlm4vVisionModel.__init__  sS      "("; +/77088%)998QGGm$[$[$[$[uV\GZGZ$[$[$[\\,&F4LY_Yj
 
 
 $00BH[#\#\#\ )*/1,	
 
 
 +6+=6CVWWW&+#r4   c                    g }|D ]x\  }}}t          j        |                              d                              d|          }|                    || j        z  | j        || j        z  | j                  }|                    dddd          }|                                }t          j        |                              d                              |d          }|                    || j        z  | j        || j        z  | j                  }|                    dddd          }|                                }|                    t          j	        ||gd          
                    |d                     zt          j        |d          }|d d dd f                                         }|                     |          }	|	|                             d          }
|
|fS )Nr   r7   r   r6   r	   r   )r+   r   r   r   r   r  r   rL  appendr   r   r   r  r  )r/   grid_thwpos_idsthwhpos_idswpos_idsmax_grid_sizerotary_pos_emb_fullr  s              r3   rot_pos_embzGlm4vVisionModel.rot_pos_emb  s    	S 	SGAq!|A0033::2qAAH''T,,'T,,'	 H  ''1a33H''))H|A0033::1bAAH''T,,'T,,'	 H  ''1a33H''))HNN5;(';DDDKKAqQQRRRR)G+++ ABB++--"11-@@,W5==a@@w&&r4   r@   r  c           	      H   |                      |          }|                     |          }|                     |          \  }}t          j        ||fd          }|                                |                                f}t          j        |dddf         |dddf         z  |dddf                                       dt          j	        
                                r|j        nt          j                  }t          j        |dd	          }|dd         |dd         z
                                  }|                     ||||dddf         |dddf                   }| j        D ]}	 |	|||
          }|                     |          }|                    d| j        | j        |j        d                   }|                    dddd          }|                     |                              d| j        j                  }|                     |          }|S )az  
        Args:
            hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`):
                The final hidden states of the model.
            grid_thw (`torch.Tensor` of shape `(num_images_or_videos, 3)`):
                The temporal, height and width of feature shape of each image in LLM.

        Returns:
            `torch.Tensor`: hidden_states.
        r7   r   Nr   r6   r   r   )r   r   )r   )r  r  r	   )r  r  r  r+   r   r   r   rW  cumsumjit
is_tracingr:   int32r   padr  r   r  r  rq   r  rF   r   r  r\   rT   r  )
r/   r@   r  r  image_type_idsrI  r  r  seqlensblks
             r3   rC   zGlm4vVisionModel.forward  s7    ((7700??)-)9)9()C)C&i8bAAA"wwyy#''))4,Xaaad^hqqq!tn-LhWXWXWXZ[W[n]]dd
 %*I$8$8$:$:K(.. e 
 

 U:vQ777
abb>JssO3;;==w.YZYZYZ\]Y]J^`nopopoprsos`tuu; 	 	CC%$7  MM ++M::%**')@-BUVXBY
 
 &--aAq9966;;B@[\\M22r4   r1  )rI   rJ   rK   r!   r   r  r(   r  r+   rt   rC   rL   rM   s   @r3   r  r    s         +,     8' ' ':.U\ .U\ .el . . . . . . . .r4   r  c                   .    e Zd ZU eed<   def fdZee	 	 	 	 	 	 	 ddee	j
                 dee	j                 dee	j
                 dee         dee	j                 d	ee         d
ee	j
                 dee         deeef         fd                        Z xZS )Glm4vTextModelr\   c                    t                                                     j        | _        j        | _        t          j        j        j        | j                  | _        t          j	        fdt          j                  D                       | _        t          j        j                  | _        t!                    | _        d| _        |                                  d S )Nc                 0    g | ]}t          |          S rQ  )r  )r   rb  r\   s     r3   r   z+Glm4vTextModel.__init__.<locals>.<listcomp>  s$    ggg)"6955gggr4   r*  r\   F)r'   r(   pad_token_idpadding_idx
vocab_sizer)   r   r0   embed_tokensr  r   num_hidden_layerslayersr$   r+  normr3  
rotary_embr  r  r   s    `r3   r(   zGlm4vTextModel.__init__  s       !. +L):F<NPTP`aamgggguVMeGfGfggg
 
 !!39LMMM	2&AAA&+#r4   N	input_idsr   r   rj  inputs_embedsr  rk  r   re   c           
      0   |d u |d uz  rt          d          |r5|3t          j                                        st	          | j                  }||                     |          }|B||                                nd}	t          j        |	|	|j	        d         z   |j
                  }|8|                    ddd                              d|j	        d         d          }n@|                                dk    r(|d	                             d|j	        d         d          }t          | j        |||||
          }
|}|                     ||          }| j        D ]} ||f||
|||d|}|}|                     |          }t%          ||          S )N:You must specify exactly one of input_ids or inputs_embedsr  r   r   r   r7   r	   r6   )N.)r\   input_embedsr   rk  rj  r   )r  r   r   rj  rk  )r  rj  )
ValueErrorr+   r  r  r   r\   r  get_seq_lengthr   rF   r   rq   r   ry   r   r  r  r  r   )r/   r  r   r   rj  r  r  rk  r   past_seen_tokensr   r@   r  decoder_layerlayer_outputss                  r3   rC   zGlm4vTextModel.forward(  s    -t";< 	[YZZZ  	?09M9M9O9O0*$+>>>O  --i88M!CRC^==???de"\ "2]5H5K"KTaTh  N
 )..q!R88??=CVWXCY[]^^LL1$$'	299!\=OPQ=RTVWWL(;&))+%
 
 
 & #oom\JJ![ 
	* 
	*M)M$7*) /-   M *MM		-00&++
 
 
 	
r4   )NNNNNNN)rI   rJ   rK   r    r   r(   r   r   r   r+   rt  rt   r   r~  rb   r   r   r   rE   r   rC   rL   rM   s   @r3   r  r    sC                 151537+/59$(59@
 @
E,-@
 !.@
 u/0	@

 "%@
   12@
 D>@
 !!12@
 -.@
 
u--	.@
 @
 @
  ^@
 @
 @
 @
 @
r4   r  c                   4    e Zd ZU dZi ZdZeed<   ddgZ fdZ	d Z
d Zd	 Zd
 Z	 	 	 	 d deej                 deej                 deej                 deej                 deej        ej        f         f
dZ	 d!dej        deej                 fdZd!dej        deej                 fdZ	 	 d"dej        dej        deej                 deej                 fdZee	 	 	 	 	 	 	 	 	 	 	 d#deej                 deej                 deej                 dee         deej                 deej                 deej                 deej                 deej                 deej                 deej                 dee         deeef         fd                        Z xZ S )$
Glm4vModel Fr\   r  r(  c                    t                                          |           t                              |j                  | _        t                              |j                  | _        d | _	        | 
                                 d S r^   )r'   r(   r  _from_configvision_configvisualr  text_configlanguage_modelr  r  r   s     r3   r(   zGlm4vModel.__init__v  sl       &33F4HII,99&:LMM 	r4   c                 4    | j                                         S r^   )r  get_input_embeddingsrG   s    r3   r  zGlm4vModel.get_input_embeddings  s    "77999r4   c                 :    | j                             |           d S r^   )r  set_input_embeddingsr/   r   s     r3   r  zGlm4vModel.set_input_embeddings  s    0077777r4   c                     || _         d S r^   r  r/   decoders     r3   set_decoderzGlm4vModel.set_decoder  s    %r4   c                     | j         S r^   r  rG   s    r3   get_decoderzGlm4vModel.get_decoder  s    ""r4   Nr  image_grid_thwvideo_grid_thwr   re   c           
         | j         j        j        }| j         j        }| j         j        }| j         j        }g }	|6||1|}
|t          j        |
          }t          j        d|j	        d         |j	        d         |j
        |j                  }d\  }}d}|                    |
j                  }t          |
          D ]|\  }}|||         dk             }|                                }g }d}|D ]d}||k    rd}n||k    rd}||k    r|s|                    d	           1||k    r|r|                    d
           O|                    d           eg }t!          j        t          |          d           D ]K\  }}t%          |          }|d         d         }|d         d         dz   }|                    |||f           Lg }d}|D ]\  }}}t'          |          dk    r|d                                         dz   nd}|d	k    r||         d         ||         d         ||         d         }!} }|                                |                                 |z  |!                                |z  }$}#}"t          j        |"                              dd                              d|#|$z                                            }%t          j        |#                              ddd                              |"d|$                                          }&t          j        |$                              ddd                              |"|#d                                          }'|                    t          j        |%|&|'g          |z              |dz  }d}|d
k    r|||         d         ||         d         }!} }||                                 |z  |!                                |z  }$}#}"t7          |"          D ]!}(t          j        |(                              dd                              d|#|$z                                            }%t          j        |#                              ddd                              dd|$                                          }&t          j        |$                              ddd                              d|#d                                          }'|                    t          j        |%|&|'g          |z              #|dz  }|||         d         k    r|dz  }d}|dz  }||z
  })|                    t          j        |)                              dd                              dd          |z              d}t          j        |d                              dd          }*|*                    |j                  |d|||         dk    f<   |	                    |*                                dz   t'          |
|                   z
             ~t          j        |	|j                                      d          }	||	fS ||                                 !                    d          dz
  }|"                    |dk    d           |                    d                              ddd                              |j                  }|                    dd          d                             dd          d         }+|+dz   |j	        d         z
  }	nt          j        |j	        d         |j                                      ddd                              d|j	        d         d          }t          j#        |j	        d         dg|j        |j
                  }	||	fS )aU  
        Calculate the 3D rope index based on image and video's temporal, height and width in LLM.

        Explanation:
            Each embedding sequence contains vision embedding and text embedding or just contains text embedding.

            For pure text embedding sequence, the rotary position embedding has no difference with modern LLMs.
            Examples:
                input_ids: [T T T T T], here T is for text.
                temporal position_ids: [0, 1, 2, 3, 4]
                height position_ids: [0, 1, 2, 3, 4]
                width position_ids: [0, 1, 2, 3, 4]

            For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
            and 1D rotary position embedding for text part.
            Examples:
                Temporal (Time): 3 patches, representing different segments of the video in time.
                Height: 2 patches, dividing each frame vertically.
                Width: 2 patches, dividing each frame horizontally.
                We also have some important parameters:
                fps (Frames Per Second): The video's frame rate, set to 1. This means one frame is processed each second.
                tokens_per_second: This is a crucial parameter. It dictates how many "time-steps" or "temporal tokens" are conceptually packed into a one-second interval of the video. In this case, we have 25 tokens per second. So each second of the video will be represented with 25 separate time points. It essentially defines the temporal granularity.
                temporal_patch_size: The number of frames that compose one temporal patch. Here, it's 2 frames.
                interval: The step size for the temporal position IDs, calculated as tokens_per_second * temporal_patch_size / fps. In this case, 25 * 2 / 1 = 50. This means that each temporal patch will be have a difference of 50 in the temporal position IDs.
                input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
                vision temporal position_ids: [0, 0, 0, 0, 50, 50, 50, 50, 100, 100, 100, 100]
                vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
                vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
                text temporal position_ids: [101, 102, 103, 104, 105]
                text height position_ids: [101, 102, 103, 104, 105]
                text width position_ids: [101, 102, 103, 104, 105]
                Here we calculate the text start position_ids as the max vision position_ids plus 1.

        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
                it.
            image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
                The temporal, height and width of feature shape of each image in LLM.
            video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
                The temporal, height and width of feature shape of each video in LLM.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

        Returns:
            position_ids (`torch.LongTensor` of shape `(3, batch_size, sequence_length)`)
            mrope_position_deltas (`torch.Tensor` of shape `(batch_size)`)
        Nr	   r   r   r:   r   )r   r   FTimagevideotextc                     | d         S )Nr   rQ  )r   s    r3   <lambda>z+Glm4vModel.get_rope_index.<locals>.<lambda>  s    [\]^[_ r4   r7   r6   r   .r  r8   r   )$r\   r  r  image_token_idvideo_start_token_idvideo_end_token_idr+   	ones_liker,   rF   r:   r   r;   rV  r  r  	itertoolsgroupbyr   r   r  itemr   rq   r   rL  r   r   r   r   r   r   r   r  masked_fill_zeros),r/   r  r  r  r   r  r  r  r  mrope_position_deltastotal_input_idsr   image_indexvideo_indexvideo_group_indexr   input_tokensinput_token_typevideo_check_flgtokeninput_type_groupr   groupstart_index	end_indexllm_pos_ids_listvideo_frame_nummodality_type	start_idxend_idxst_idxr  r  r  
llm_grid_t
llm_grid_h
llm_grid_wt_indexh_indexw_indext_idxtext_lenllm_positionsmax_position_idss,                                               r3   get_rope_indexzGlm4vModel.get_rope_index  s   v "[6I3#{?![; " n&@ND^'O%!&!A!A :""o '  L (,$K !+../EFFN )/ : : W` W`9%nQ&71&<=	(//11#% "') 8 8E 444*."444*/...(//8888.00_0(//8888(//7777#% "+"3I>N4O4OQ_Q_"`"` K KJC KKE"'(1+K %b	!q 0I$++S+y,IJJJJ#% "#9I 7, 7,5M9g?BCS?T?TWX?X?X-b15577!;;^_F$//*;7:*;7:*;7:  1 FFHHFFHH(::FFHH(:: 1;J
 #(,z":":"?"?A"F"F"M"MbR\_iRi"j"j"r"r"t"t"',z":":"?"?2q"I"I"P"PQ[]_ak"l"l"t"t"v"v"',z":":"?"?1b"I"I"P"PQ[]gik"l"l"t"t"v"v(//Wgw<W0X0X[a0abbb#q(*+&'11+*;7:*;7:  1 FFHH(::FFHH(:: 1;J
 &+:%6%6 g gE&+l5&9&9&>&>r1&E&E&L&LRQ[^hQh&i&i&q&q&s&sG&+l:&>&>&C&CAr1&M&M&T&TUVXZ\f&g&g&o&o&q&qG&+l:&>&>&C&CAq"&M&M&T&TUVXbdf&g&g&o&o&q&qG,33EK'SZ@[4\4\_e4effff)Q.),{0KA0NNN'1,K01-'1, $+Y#6(//X0F0F0K0KAr0R0R0Y0YZ[]_0`0`ci0ijjj*+ %	*: B B B J J1b Q Q?L?O?OP\Pc?d?dS!^A%6!%;;<%,,]->->-@-@1-Ds?[\K]G^G^-^____$)L1FyO_$`$`$`$j$jkl$m$m!!666)-2244;;B??!C)).A*=qAAA+55a88??2rJJMMnNcdd#/#3#3Au#3#E#Ea#H#L#LRY]#L#^#^_`#a (81(<~?STV?W(W%% L!3I<LMMMT!Q^^VAyq1266 
 )._Q'+$+#/) ) )%  !666r4   pixel_values_videosc                 :   |                     | j        j                  }g }|D ]}\  }}}t          j        d|                                |                                g                              d                              |d          }|                    |           ~t          j	        |d          }|                     ||          }	|
                    d          | j        j        dz  z                                  }
t          j        |	|
          }	|	S )a  
        Encodes videos into continuous embeddings that can be forwarded to the language model.

        Args:
            pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
                The tensors corresponding to the input videos.
            video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
                The temporal, height and width of feature shape of each video in LLM.
        r   r   r   r  r7   r6   )r7  r  r:   r+   r   r  r   r   r  r   prodr  r  r  )r/   r   r  temp_frames_hwr  r  r  repeated_rowflattened_video_grid_thwvideo_embedssplit_sizess              r3   get_video_featureszGlm4vModel.get_video_featuresI  s    266t{7HII% 	0 	0GAq! <AFFHHaffhh(?@@JJ1MMTTUVXYZZL!!,////#(9^#C#C#C {{#6AY{ZZ%**2..$+2PRS2SS[[]]{<==r4   pixel_valuesc                 
   |                     | j        j                  }|                     ||          }|                    d          | j        j        dz  z                                  }t          j        ||          }|S )a  
        Encodes images into continuous embeddings that can be forwarded to the language model.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
                The tensors corresponding to the input images.
            image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
                The temporal, height and width of feature shape of each image in LLM.
        r"  r7   r6   )r7  r  r:   r#  r  r  r+   r  )r/   r*  r  image_embedsr(  s        r3   get_image_featureszGlm4vModel.get_image_featuresa  sw     $(():;;{{<.{II%**2..$+2PRS2SS[[]]{<==r4   r  image_featuresvideo_featuresc                 x   || |                                  t          j        | j        j        t          j        |j                            k    }|                    d          }| |                                  t          j        | j        j        t          j        |j                            k    }|                    d          }n || j        j        k    }|| j        j        k    }|	                                }|
                    d                              |                              |j                  }|P||                                         |                                k    r t          d| d|j        d                    |	                                }|
                    d                              |                              |j                  }|P||                                         |                                k    r t          d| d|j        d                    ||fS )z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        Nr  r7   z6Image features and image tokens do not match: tokens: z, features r   z7Videos features and video tokens do not match: tokens: )r  r+   r   r\   r  r   r   allvideo_token_idsumr   	expand_asr;   numelr  rF   )	r/   r  r  r.  r/  special_image_maskspecial_video_maskn_image_tokensn_video_tokenss	            r3   get_placeholder_maskzGlm4vModel.get_placeholder_maskq  s;    !.2M$2K2K2M2MT[7uzR_Rfggg3 3 " "4!7!7!;!;!.2M$2K2K2M2MT[7uzR_Rfggg3 3 " "4!7!7!;!; "+dk.H!H!*dk.H!H+//11/99"==GGVVYYZgZnoo%-8J*K*Q*Q*S*SWeWkWkWmWm*m*m}}}drdxyzd{}}   ,//11/99"==GGVVYYZgZnoo%-8J*K*Q*Q*S*SWeWkWkWmWm*m*m~.~~eseyz{e|~~   "#555r4   r   rj  r  rk  r   c           
         |du |duz  rt          d          | |                                 |          }|{|                     ||          }t          j        |d                              |j        |j                  }|                     |||          \  }}|	                    ||          }|{| 
                    ||	          }t          j        |d                              |j        |j                  }|                     |||          \  }}|	                    ||          }|t          |t                    s|n|d         }|p|j        dk    ret          j        |dddf         d	d
          }|j        j        r8|t          j        |j                  j        z  }d|z
                                  }t'                      o)|dur|j        d	         d	k    p|duo|j        d	         d	k    }t'                       o+|dur|d         dk    p|du p|                                dk    }|s	|s| j        $|                     |||	|          \  }}
|
| _        n|j        \  }}}|(|d         | j        z                       |j                  nd}t          j        ||j                  }|                    d	d                              |d          }|%|                    ||j        d         z  d          }|                    |          }|                    d                              ddd          } | j        dd|||||d|}t?          |j         |j!        |j"        |j#        | j                  S )a  
        image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
            The temporal, height and width of feature shape of each image in LLM.
        video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
            The temporal, height and width of feature shape of each video in LLM.
        rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
            The rope index difference between sequence length and multimodal rope.
        Nr  r   r   )r.  )r/  full_attention   r   r6   )dim1dim2r|   )r   r  r7   r	   )r  r   r   rj  r  rk  )r  rj  r@   r  r  rQ  )$r  r  r-  r+   r   r;   r   r:   r:  masked_scatterr)  r   dictndimdiagonalis_floating_pointfinfominr   r   rF   r  r  r  r   rq   r   rW  addr   r  r  r  rj  r@   r  )r/   r  r   r   rj  r  r*  r   r  r  r  rk  r   r,  
image_maskr#  r'  
video_maskattention_mask_tensorprefill_compiled_stageprefill_noncompiled_stage
batch_sizer   deltaoutputss                            r3   rC   zGlm4vModel.forward  s:   2 -t";< 	[YZZZ 7D5577	BBM#22<PPL 9\q999<<]=QS`SfggL 55i_k5llMJ)88\RRM*223FWWL 9\q999<<]=QS`SfggL 55i_k5llMAz)88\RRM&0&F&FlN[kLl " %05J5OST5T5T(-7LQQQPQT7RYZab(c(c(c%(.@ P,AEKPePkDlDlDp,p)-03H-H,M,M,O,O) &>%?%? &$&B9?1+=+B O!-M-2Ea2HA2M # -E,F,F(F )t+Fq0AQ0F V#t+T/M/M/O/OST/T & ' K*C KHXH`,0,?,?""#8	 -@ - -)k $/   -:,?)
J &1 $A&)99==m>RSSS 
  %|J}?STTT+00B77>>z2NN!-!33J%+a.4PVW3XXE+//66+55a88??2rJJ%$% 
%)+')
 
 
 
 (%7#3!/)(
 
 
 	
r4   rr  r^   r&  )NNNNNNNNNNN)!rI   rJ   rK   r  _checkpoint_conversion_mappingaccepts_loss_kwargsr   r   r  r(   r  r  r  r  r   r+   rt  rt   rE   r  r~  r)  r-  r:  r   r   r   r   r   r   r  rC   rL   rM   s   @r3   r  r  m  sH        %'"02DE    : : :8 8 8& & &# # #
 15595915|7 |7E,-|7 !!12|7 !!12	|7
 !.|7 
u|U\)	*|7 |7 |7 |7~ dh #(#4FNuO_F`   0 u/@ RZ[`[kRl    ( 7;6:'6 '6#'6 ('6 !!23	'6
 !!23'6 '6 '6 '6R  151537+/59/3;?59592659g
 g
E,-g
 !.g
 u/0	g

 "%g
   12g
 u|,g
 &e&78g
 !!12g
 !!12g
 e./g
 !!12g
 +,g
 
u..	/g
 g
 g
  ^g
 g
 g
 g
 g
r4   r  zQ
    Base class for Glm4v causal language model (or autoregressive) outputs.
    c                       e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
ee         ed<   dZeeej                          ed<   dZeeej                          ed<   dZeej                 ed<   dS )	Glm4vCausalLMOutputWithPasta  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
        The rope index difference between sequence length and multimodal rope.
    Nlosslogitsrj  r@   r  r  )rI   rJ   rK   rs  rT  r   r+   r~  r   rU  rj  r   r@   rE   r  r  rt  rQ  r4   r3   rS  rS    s           )-D(5$
%,,,*.FHU&'...'+OXe_+++8<M8E%"345<<<59Ju012999.2K%*+22222r4   rS  c            #       r    e Zd Zi ZdgZdZ fdZd Zd Zd Z	d Z
	 d%d	ej        d
eej                 fdZd%dej        deej                 fdZed             Zed             Zee	 	 	 	 	 	 	 	 	 	 	 	 	 d&deej                 deej                 deej                 dee         deej                 deej                 deej                 d	eej                 deej                 d
eej                 deej                 deej                 deeej        f         dee         deeef         fd                        Z	 	 	 	 	 	 	 	 	 	 d' fd	Z	 d%deej                 deej                 deej        ej        f         fd Z 	 	 	 d(d"ed#e!deej                 deej        e"e#e$f         f         fd$Z% xZ&S ))Glm4vForConditionalGenerationzlm_head.weightFc                     t                                          |           t          |          | _        t	          j        |j        j        |j        j        d          | _	        | 
                                 d S )NFrS   )r'   r(   r  r  r)   rV   r  r0   r  lm_headr  r   s     r3   r(   z&Glm4vForConditionalGeneration.__init__)  se       ''
y!3!?ASA^ejkkkr4   c                 4    | j                                         S r^   )r  r  rG   s    r3   r  z2Glm4vForConditionalGeneration.get_input_embeddings0  s    z..000r4   c                 :    | j                             |           d S r^   )r  r  r  s     r3   r  z2Glm4vForConditionalGeneration.set_input_embeddings3  s    
''.....r4   c                 :    | j                             |           d S r^   )r  r  r  s     r3   r  z)Glm4vForConditionalGeneration.set_decoder6  s    
w'''''r4   c                 4    | j                                         S r^   )r  r  rG   s    r3   r  z)Glm4vForConditionalGeneration.get_decoder9  s    z%%'''r4   Nr   r  c                 8    | j                             ||          S r^   )r  r)  )r/   r   r  s      r3   r)  z0Glm4vForConditionalGeneration.get_video_features<  s     z,,-@.QQQr4   r*  r  c                 8    | j                             ||          S r^   )r  r-  )r/   r*  r  s      r3   r-  z0Glm4vForConditionalGeneration.get_image_featuresA  s    z,,\>JJJr4   c                     | j         j        S r^   )r  r  rG   s    r3   r  z,Glm4vForConditionalGeneration.language_modelE  s    z((r4   c                     | j         j        S r^   )r  r  rG   s    r3   r  z$Glm4vForConditionalGeneration.visualI  s    z  r4   r   r  r   r   rj  r  labelsr  rk  logits_to_keepr   re   c                 ~    | j         d||||	|
|||||d
|}|d         }t          |t                    rt          | d          n|}|                     |dd|ddf                   }d}|'|                     ||| j        j        j                  }t          |||j
        |j        |j        |j                  S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
            The temporal, height and width of feature shape of each image in LLM.
        video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
            The temporal, height and width of feature shape of each video in LLM.
        rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
            The rope index difference between sequence length and multimodal rope.

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Glm4vForConditionalGeneration

        >>> model = Glm4vForConditionalGeneration.from_pretrained("THUDM/GLM-4.1V-9B-Thinking")
        >>> processor = AutoProcessor.from_pretrained("THUDM/GLM-4.1V-9B-Thinking")

        >>> messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": "What is shown in this image?"},
                ],
            },
        ]
        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        >>> inputs = processor(text=[text], images=[image], vision_infos=[vision_infos])

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "The image shows a street scene with a red stop sign in the foreground. In the background, there is a large red gate with Chinese characters ..."
        ```)
r  r*  r   r  r  r   r   rj  r  rk  r   N)rU  rb  r  )rT  rU  rj  r@   r  r  rQ  )r  r   r   slicerY  loss_functionr\   r  r  rS  rj  r@   r  r  )r/   r  r   r   rj  r  rb  r*  r   r  r  r  rk  rc  r   rO  r@   slice_indicesrU  rT  s                       r3   rC   z%Glm4vForConditionalGeneration.forwardM  s   z $* 
% 3))%)+')
 
 
 
  
 9C>SV8W8Wk~ot444]kmAAA}aaa,?@AA%%VFt{OfOq%rrD*#3!/)+
 
 
 	
r4   Tc                      t                      j        |f|||||||	|
||d
|}d |d<   |d         dk    r
d |d<   d |d<   |S )N)
rj  r   r  rk  r   r*  r   r  r  r  r   r   r*  r   )r'   prepare_inputs_for_generation)r/   r  rj  r   r  rk  r   r  r*  r   r  r  r   model_inputsr2   s                 r3   ri  z;Glm4vForConditionalGeneration.prepare_inputs_for_generation  s    " =uww<
+)')%% 3))
 
 
 
  (,^$!!!+/L(26L./r4   c                 Z   | | |                                  t          j        | j        j        t          j        |j                            k    d         }| |                                  t          j        | j        j        t          j        |j                            k    d         }| |                                  t          j        | j        j        t          j        |j                            k    d         }n0|| j        j        k    }|| j        j        k    }|| j        j        k    }t          j	        |
                                |
                                z
  d          }|dk    }|| z  }|                    d          }	|                    d          }
|	|
fS )aa  
        Get the number of images and videos for each sample to calculate the separation length of the sample tensor.
        These parameters are not passed through the processor to avoid unpredictable impacts from interface modifications.

        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary.

        Returns:
            image_nums (`torch.LongTensor` of shape `(batch_size, num_images_sample)`)
            video_nums (`torch.LongTensor` of shape `(batch_size, num_videos_sample)`)
        Nr  ).r   r   r   r   )r  r+   r   r\   image_start_token_idr   r   r  r  r  r   r3  )r/   r  r  is_imageis_video_startis_video_endvideo_levelinside_videostandalone_imagesimage_countsvideo_countss              r3   _get_image_nums_and_video_numsz<Glm4vForConditionalGeneration._get_image_nums_and_video_nums  s   $ $.4,,..L!A\i\pqqq  H .4,,..L!A\i\pqqq  N .4,,..L!?uzZgZnooo  LL !DK$DDH&$+*JJN$(FFL l>#5#5#7#7,:J:J:L:L#LRSTTT"Q %6 ),,,33%))a)00\))r4   r   expand_sizeis_encoder_decoderc                     dk    rfS g d fd}fd} |                               d           |          |r8                    d          t          d           |d                   d<   fS )	Nr   )r*  r  r   r  second_per_grid_tsc                                         dd           }                     dd           }                    
                     dd                     \  }}d }| D ]}|dk    rFt          j        |t	          |                    }d |D             } || |         |	          | |<   O|dk    r't	          |          } || |         |	          | |<   ||d	k    rFt          j        |t	          |                    }d
 |D             } || |         |	          | |<   |dk    r't	          |          } || |         |	          | |<   |dk    r$ || |         t	          |          	          | |<   !| S )Nr  r  r  )r  c                     t          j        | |          }|gdg|                                 dz
  z  z   t          j        fd|D             d          }|S )Nr   c                 $    g | ]} |j          S rQ  r   )r   samplerepeat_argss     r3   r   zGlm4vForConditionalGeneration._expand_inputs_for_generation.<locals>._expand_dict_for_generation_visual.<locals>._repeat_interleave_samples.<locals>.<listcomp>'  s"    #V#V#VFMFM;$?#V#V#Vr4   r   r   )r+   r  ry   r   )r   r   repeat_timessamplesresultr~  s        @r3   _repeat_interleave_sampleszGlm4vForConditionalGeneration._expand_inputs_for_generation.<locals>._expand_dict_for_generation_visual.<locals>._repeat_interleave_samples$  sb    +a11+nsaeeggk/BB#V#V#V#Vg#V#V#V\]^^^r4   r*  c                 ^    g | ]*}t          j        |d                                           +S r   r   r+   r#  r3  r   r}  s     r3   r   z{Glm4vForConditionalGeneration._expand_inputs_for_generation.<locals>._expand_dict_for_generation_visual.<locals>.<listcomp>/  3    UUU6uz&a888<<>>UUUr4   )r   r  r   c                 ^    g | ]*}t          j        |d                                           +S r  r  r  s     r3   r   z{Glm4vForConditionalGeneration._expand_inputs_for_generation.<locals>._expand_dict_for_generation_visual.<locals>.<listcomp>;  r  r4   ry  )r:  ru  r+   r  r   )dict_to_expandr  r  
image_nums
video_numsr  r   r  r   rv  r  model_kwargsr/   s            r3   "_expand_dict_for_generation_visualzgGlm4vForConditionalGeneration._expand_inputs_for_generation.<locals>._expand_dict_for_generation_visual  s   )--.>EEN)--.>EEN%)%H%H)9)9/4)P)P &I & &"J
   &  .((#k.$z:J:JKKGUUWUUUG*D*D&s+W;+ + +N3'' ,,,":..G*D*D&s+W;+ + +N3'' 111#k.$z:J:JKKGUUWUUUG*D*D&s+W;+ + +N3'' ,,,":..G*D*D&s+W;+ + +N3'' 000*D*D&s+T*5E5ET_+ + +N3' "!r4   c                     | D ]T}|dk    rL| |         Dt          | |         t          j                  r$|vr | |                             d          | |<   U| S )Nrk  r   r   )r   r+   rt   rW  )r  r   rv  visual_keyss     r3   _expand_dict_for_generationz`Glm4vForConditionalGeneration._expand_inputs_for_generation.<locals>._expand_dict_for_generationJ  sz    % d d+++&s+7">##6EE 8;..*8*=*O*OP[ab*O*c*cN3'!!r4   r   r   encoder_outputszMIf `is_encoder_decoder` is True, make sure that `encoder_outputs` is defined.)rW  r:  r  )r/   rv  rw  r  r  r  r  r  s   `` ``  @r3   _expand_inputs_for_generationz;Glm4vForConditionalGeneration._expand_inputs_for_generation  s    !l**www+	" +	" +	" +	" +	" +	" +	" +	"Z		" 		" 		" 		" 		" 		" :9,GG !33KQ3GGI22<@@ 	k 122: !pqqq.I.I,WhJi.j.jL*+,&&r4   r^   )NNNNNNNNNNNNr   )
NNNNNTNNNN)r   FN)'rI   rJ   rK   rP  _tied_weights_keysrQ  r(   r  r  r  r  r+   r~  r   rt  r)  r-  propertyr  r  r   r   rt   r   r   r   r   r   rE   rS  rC   ri  ru  rb   rA  r   r   r  rL   rM   s   @r3   rW  rW  #  s       %'"*+    1 1 1/ / /( ( (( ( ( dhR R#(#4RFNuO_F`R R R R
K Ku/@ KRZ[`[kRl K K K K ) ) X) ! ! X!  151537+/59-1/3;?5959265934Z
 Z
E,-Z
 !.Z
 u/0	Z

 "%Z
   12Z
 )*Z
 u|,Z
 &e&78Z
 !!12Z
 !!12Z
 e./Z
 !!12Z
 c5</0Z
 +,Z
  
u11	2!Z
 Z
 Z
 ^ Z
~  ' ' ' ' ' 'X 156* 6*E,-6*  -6* 
u|U\)	*	6* 6* 6* 6*t #(04	U' U'U' !U' E,-	U' 
uc3h/	0U' U' U' U' U' U' U' U'r4   rW  )rW  r  r  r  )r   r   )Sr  dataclassesr   typingr   r   r   r   r+   torch.nnr)   torch.nn.functionalr   r   r   activationsr
   cache_utilsr   r   
generationr   integrationsr   masking_utilsr   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.genericr   configuration_glm4vr   r    r!   Moduler$   rO   rd   rv   r   r   r   rt   rE   r   r   r   r   r  r  r(  r3  rM  r_  ra  rv  r  r  r  r  r  r  rS  rW  __all__rQ  r4   r3   <module>r     s:  *     ! ! ! ! ! ! 1 1 1 1 1 1 1 1 1 1 1 1                       ! ! ! ! ! ! . . . . . . . . ) ) ) ) ) ) 7 7 7 7 7 7 / / / / / / B B B B B B 9 9 9 9 9 9 D D D D D D D D K K K K K K K K F F F F F F F F & & & & & & c c c c c c c c c c c c / / / / / / P P P P P P P P P P Y''J J J J J29 J J ('J(f f f f fBI f f f    BI   (       f f f f fRY f f f"T T T T TBI T T Tn( ( (|+0<>Cl
5<%&   	UU\ 	U# 	U%, 	U 	U 	U 	U& % %I%<% 
% <	%
 U\*% % % '(% % % %4P P P P P29 P P Pf    1   6#< #< #< #< #<ry #< #< #<L6 6 66 6 6 6rH) H) H) H) H) H) H) H)V) ) ) ) )29 ) ) )$2 2 2 2 26 2 2 2j   
3 3 3 3 3{ 3 3  3$     ?   "k k k k k+ k k k\ U
 U
 U
 U
 U
) U
 U
 U
p U
 U
 U
 U
 U
% U
 U
 U
p   
3 3 3 3 3+ 3 3  3.~' ~' ~' ~' ~'$8/ ~' ~' ~'B
 d
c
cr4   