
    .`i)_              
       d   d Z ddlmZ ddlmZ ddlZddlmZ ddlmZ ddl	m
Z
 ddlmZ dd	lmZmZmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZmZmZ ddlmZ ddlm Z  ddl!m"Z"m#Z#m$Z$m%Z% ddl&m'Z' ddl(m)Z) ddl*m+Z+m,Z, ddl-m.Z. ddl/m0Z0 ddl1m2Z2 ddl3m4Z4m5Z5m6Z6m7Z7m8Z8 ddl9m:Z:m;Z;m<Z<m=Z=m>Z>  G d dej?                  Z@ G d dej?                  ZA G d d ej?                  ZB G d! d"ej?                  ZCe G d# d$ej?                              ZD G d% d&ej?        e4e6e8e5e7          ZEdS )'zInference-only FalconH1 model.    )Iterable)isliceN)nn)FalconH1Config)	Attention)support_torch_compile)CacheConfigModelConfig
VllmConfig)$get_tensor_model_parallel_world_size)get_pp_group)
SiluAndMul)RMSNorm)MergedColumnParallelLinearQKVParallelLinearRowParallelLinear)LogitsProcessor)MambaMixer2)MambaStateCopyFuncMambaStateCopyFuncCalculatorMambaStateDtypeCalculatorMambaStateShapeCalculator)QuantizationConfig)get_rope)ParallelLMHeadVocabParallelEmbedding)default_weight_loader)IntermediateTensors)set_default_rope_theta   )HasInnerStateIsHybridSupportsLoRASupportsMambaPrefixCaching
SupportsPP)PPMissingLayeris_pp_missing_parameter'make_empty_intermediate_tensors_factorymake_layersmaybe_prefixc                   H     e Zd Z	 	 	 ddededz  dededdf
 fd	Zd
 Z xZ	S )FalconH1MLPNF configquant_configbiasprefixreturnc                    t                                                       t          |j        |j        gdz  ||| d          | _        t          |j        |j        ||| d          | _        t                      | _	        |j        | _        |j
        \  | _        | _        |j        dk    rt          d|j         d          t                      | _        d S )	N   z.gate_up_proj)
input_sizeoutput_sizesr0   r/   r1   z
.down_proj)r5   output_sizer0   r/   r1   siluzUnsupported activation: z!. Only silu is supported for now.)super__init__r   hidden_sizeintermediate_sizegate_up_projr   	down_projr   tp_sizemlp_multipliersgate_multiplierdown_multiplier
hidden_act
ValueErrorr   act_fn)selfr.   r/   r0   r1   	__class__s        x/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/falcon_h1.pyr:   zFalconH1MLP.__init__;   s    	6) 23a7%+++
 
 
 +/*%(((
 
 
 <==!'!95;5K2d2&&26+< 2 2 2   !ll    c                     |                      |          \  }}|d d d | j        | j        z  fxx         | j        z  cc<   |                     |          }|                     |          \  }}|| j        z  }|S N)r=   r<   r?   rA   rE   r>   rB   )rF   x_s      rH   forwardzFalconH1MLP.forward[   s      ##1	!!!5t%55
5666$:NN666KKNN~~a  1$$rI   )NFr-   )
__name__
__module____qualname__r   r   boolstrr:   rN   __classcell__rG   s   @rH   r,   r,   :   s         37# ## )4/# 	#
 # 
# # # # # #@      rI   r,   c                        e Zd Z	 	 	 	 ddededz  dedz  dedz  deddf fd	Zd
 Z	de
j        de
j        dz  fdZ xZS )FalconH1SSMDecoderLayerNr-   r.   model_configcache_configr/   r1   r2   c                 H   t                                                       || _        t                      | _        |j        t          |j        |j        z            n|j        | _	        t          di d|j        d|j        d|j        d| j	        d|j        d|j        d|j        d|j        d	|j        d
|j        d|j        d|d|d|d|j        d| d| _        | j        j        |j        z  | _        |j        | _        |                                  d S )Nr;   ssm_state_sizeconv_kernel_sizer<   use_conv_biasuse_biasn_groups	num_headshead_dimrms_norm_eps
activationrX   rY   r/   use_rms_normr1   z.mixer )r9   r:   r.   r   r?   mamba_d_ssmintmamba_expandr;   d_ssmr   mamba_d_statemamba_d_convmamba_conv_biasmamba_proj_biasmamba_n_groupsmamba_n_headsmamba_d_headrb   rC   mamba_rms_normmambar_   groups_time_state_sizessm_multiplierszxbcdt_multipliers_init_mup_vector)rF   r.   rX   rY   r/   r1   rG   s         rH   r:   z FalconH1SSMDecoderLayer.__init__e   s    	;== !) #f&88999# 	
 ! 
 
 
**
!//
 $00
 #jj	

 !00
 ++
 **
 **
 ((
  ,,
 ((
 &
 &
 &
  ..
  $$$$!

& '+j&9F<P&P#"("8rI   c                 >   d| j         z  d| j        z  z   | j        j        z   | j        z  }t          j        d|          }|ddd| j         | j        z  fxx         | j        d         z  cc<   |dd| j         | j        z  d| j         z  | j        z  fxx         | j        d         z  cc<   |ddd| j         z  | j        z  d| j         z  | j        z   | j        z  fxx         | j        d         z  cc<   |ddd| j         z  | j        z   | j        z  d| j         z  d| j        z  z   | j        z  fxx         | j        d         z  cc<   |ddd| j         z  d| j        z  z   | j        z  dfxx         | j        d         z  cc<   |                     d|d	           dS )
u  
        Non learnable per-block scaling vector composed of element-wise
        multipliersapplied to each separate contiguous block of the output
        of the linear projection (in_proj) before further processing
        (gating, convolution, SSM):

            - Z block:  [0 : d_ssm]                      → zxbcdt_multipliers[0]
            - X block:  [d_ssm : 2 * d_ssm]              → zxbcdt_multipliers[1]
            - B block:  [2 * d_ssm : 2 * d_ssm + G * S]  → zxbcdt_multipliers[2]
            - C block:  [2 * d_ssm + G * S : 2 * d_ssm + 2 * G * S]
                        → zxbcdt_multipliers[3]
            - dt block: [2 * d_ssm + 2 * G * S : end]    → zxbcdt_multipliers[4]

        where:
            - d_ssm:     Dimension of state-space model latent
            - G:         Number of groups (n_groups)
            - S:         SSM state size per group
            - All indices are divided by tp_size to support tensor parallelism
        r4   r    Nr         
mup_vectorF)
persistent)	ri   rs   r.   ro   r?   torchonesru   register_buffer)rF   vector_shaperz   s      rH   rv   z(FalconH1SSMDecoderLayer._init_mup_vector   sO   * 
NQ!<<<t{?XX\ Z<00
1112
dl222333t7Nq7QQ333AA
dl*q4:~/MNN	
 	
 	
$Q'	( 	
 	
 	
 	AA^,DJ!<<|0 	
 	
 	
 $Q'	( 	
 	
 	
 	AA^d99dlJDJT%@!@@|N 	
 	
 	
 $Q'	( 	
 	
 	
 	AA^a$"===$,NPPR	
 	
 	
 $Q'	( 	
 	
 	

 	\:%HHHHHrI   hidden_statesresidualc                 B    |                      || j                  }||fS )N)rz   )rr   rz   )rF   r   r   kwargsoutputs        rH   rN   zFalconH1SSMDecoderLayer.forward   s1       
 
 xrI   NNNr-   )rO   rP   rQ   r   r
   r	   r   rS   r:   rv   r|   TensorrN   rT   rU   s   @rH   rW   rW   d   s         ,0+/26'  ' '  "D('  "D(	' 
 )4/'  '  
'  '  '  '  '  ' R6I 6I 6Ip
 |
  ,%
  
  
  
  
  
  
  
 rI   rW   c                        e Zd Z	 	 	 ddededz  dedz  deddf
 fdZd	ej	        d
ej	        dej	        fdZ
d	ej	        d
ej	        dej	        dz  fdZ xZS )FalconH1AttentionDecoderLayerNr-   r.   rY   r/   r1   r2   c           
      d   t                                                       t          |d           t          |dd          }|j        | _        t                      }|j        | _        | j        |z  dk    sJ | j        |z  | _        |j	        | _
        | j
        |k    r| j
        |z  dk    sJ n|| j
        z  dk    sJ t          d| j
        |z            | _        t          |dd           |j        | j        z  n|j        | _        | j        | j        z  | _        | j        | j        z  | _        | j        dz  | _        || _        t          |d	| j                  }|| j        z  |j        d
<   t'          | j        ||j        dd           | _        t+          |j        | j        | j        | j
        d|| d          | _        t/          | j        | j        z  |j        d|| d          | _        t3          | j        | j        | j        | j        || d          | _        |j        | _        d S )Ng   vH7B)default_thetamax_position_embeddingsi    r   r    ra   g      attn_rotary_embpartial_rotary_factorT)	head_sizemax_positionrope_parametersis_neox_styledtypeFz	.qkv_proj)r0   r/   r1   z.o_projz.attn)num_kv_headsrY   r1   )r9   r:   r   getattrr;   r   num_attention_headstotal_num_headsr`   num_key_value_headstotal_num_kv_headsmaxr   ra   q_sizekv_sizescalingr   r   r   
rotary_embr   qkv_projr   o_projr   attnkey_multiplier)	rF   r.   rY   r/   r1   r   r?   
rotary_dimrG   s	           rH   r:   z&FalconH1AttentionDecoderLayer.__init__   s    	vT::::")&2KT"R"R!-688%9#g-2222-8"("<"g-- *W499999 T4499994#:g#EFF vz4008 $"666 	
 nt}4(4=8}d*'>$V%6FF
:Dt}:T67"m0"2
 
 
 *M #%'''
 
 
 ( 4=0%%%%
 
 
 NML*%###
 
 
	 %3rI   	positionsr   c                 6   |                      |          \  }}|                    | j        | j        | j        gd          \  }}}|| j        z  }|                     |||          \  }}|                     |||          }	|                     |	          \  }
}|
S )N)dim)r   splitr   r   r   r   r   r   )rF   r   r   r   qkvrM   qkvattn_outputr   s              rH   self_attentionz,FalconH1AttentionDecoderLayer.self_attention  s     }--Q))T[$,E2)NN1a##y!Q//1ii1a((KK,,	rI   r   c                 8    |                      ||          }||fS )Nr   r   )r   )rF   r   r   r   r   s        rH   rN   z%FalconH1AttentionDecoderLayer.forward,  s1     ++' , 
 
 h&&rI   )NNr-   )rO   rP   rQ   r   r	   r   rS   r:   r|   r   r   rN   rT   rU   s   @rH   r   r      s        ,026G4 G4G4 "D(G4 )4/	G4
 G4 
G4 G4 G4 G4 G4 G4R< |
 
   '<' |' ,%	' ' ' ' ' ' ' 'rI   r   c                        e Zd ZdZ	 	 	 	 ddedededz  dedz  dedz  d	e	d
df fdZ
dej        dej        fdZ xZS )FalconH1ParallelHybrida  
    A hybrid decoder layer for FalconH1 where the input is processed
    in parallel through both the self-attention branch and the SSM (Mamba)
    branch. Their outputs are then summed to produce the final hidden state.

    This layer uses:
      - FalconH1AttentionDecoderLayer for the multi-head self-attention branch.
      - FalconH1SSMDecoderLayer for the state-space (Mamba) branch.
    Nr-   r.   	layer_idxrX   rY   r/   r1   r2   c                    t                                                       t          ||||          | _        |j        |z   }|                    d          d         d| z   }t          |||||          | _        |j        | _        |j	        | _	        |j
        | _
        |j        | _        t          || d          | _        t          |j        |j                  | _        t          |j        |j                  | _        d S )N)r.   rY   r/   r1   .r   )r.   rX   rY   r/   r1   z.feed_forwardr1   eps)r9   r:   r   	self_attnnum_hidden_layersr   rW   rr   ssm_out_multiplierssm_in_multiplierattention_in_multiplierattention_out_multiplierattn_out_multiplierr,   feed_forwardr   r;   rb   input_layernormpre_ff_layernorm)
rF   r.   r   rX   rY   r/   r1   ssm_layer_idx
ssm_prefixrG   s
            rH   r:   zFalconH1ParallelHybrid.__init__E  s!    	 7%%	
 
 
 09<\\#&&q),?,?,??
 -%%%
 
 

 #)";!'!9'-'E$#)#B '&7O7O7OPPP&v'9v?RSSS '(:@S T T TrI   r   r   c                 8   |}|                      |          } | j        d||| j        z  |d|\  }} | j        d|| j        z  |d|\  }}|| j        z  || j        z  z   }||z   }|}|                     |          }|                     |          }||z   }|S )N)r   r   r   )r   r   re   )	r   r   r   rr   r   r   r   r   r   )rF   r   r   r   r   attn_hiddenrM   
ssm_hiddens           rH   rN   zFalconH1ParallelHybrid.forwardp  s     !,,];; ( 
'$*FF
 
 	
 
Q #
 
'$*@@
 
 
 

A %t'??00
 &0 !--m<<))-88 =0rI   r   )rO   rP   rQ   __doc__r   rg   r
   r	   r   rS   r:   r|   r   rN   rT   rU   s   @rH   r   r   :  s          ,0+/26)U )U)U )U "D(	)U
 "D()U )4/)U )U 
)U )U )U )U )U )UV(<( |( ( ( ( ( ( ( (rI   r   c                        e Zd Zdddedef fdZdej        dej        fdZ	 	 ddej        d
ej        de	d	z  dej        d	z  dej        f
dZ
 xZS )FalconH1Modelr-   r   vllm_configr1   c                   t                                                       |j        j        |j        |j        |j        | _        j        | _        t                      j	        r,t          | j        j                  | _        j        | _        nt                      | _        d| _        dt          ffd}t!          j        || d          \  | _        | _        | _        t+          ddgj                  | _        t                      j        r"t1          j        j                  | _        d S t                      | _        d S )	Ng      ?r1   c                     t          |                     dd          d                   }t          } |||           S )Nr   r    )r/   r1   )rg   rsplitr   )r1   r   layer_classrY   r.   rX   r/   s      rH   	get_layerz)FalconH1Model.__init__.<locals>.get_layer  sS    FMM#q11!455I0K;)   rI   z.layersr   r   r   r   )r9   r:   rX   	hf_configrY   r/   r.   
vocab_sizer   is_first_rankr   r;   embed_tokensembedding_multiplierr&   rS   r)   r   start_layer	end_layerlayersr(   make_empty_intermediate_tensorsis_last_rankr   rb   final_layernorm)	rF   r   r1   r   rY   r.   rX   r/   rG   s	       @@@@rH   r:   zFalconH1Model.__init__  sw   !,!9!C"/"/"/ +>>' 	, 6"! !D )/(CD%% . 0 0D(+D%
	c 
	 
	 
	 
	 
	 
	 
	 
	 
	 9D$i68J8J8J9
 9
 9
5$.$+ 0Wj)6+=0
 0
, >>& 	4#*6+=6CV#W#W#WD   #1#3#3D   rI   	input_idsr2   c                 ,    |                      |          S rK   )r   rF   r   s     rH   embed_input_idszFalconH1Model.embed_input_ids  s      +++rI   Nr   intermediate_tensorsinputs_embedsc                 l   t                      j        r+||| j        z  }n*|                     |          | j        z  }n|J |d         }t	          | j        | j        | j                  D ]} |||          }t                      j        st          d|i          S | 
                    |          }|S )Nr   r   )r   r   r   r   r   r   r   r   r   r   r   )rF   r   r   r   r   r   layers          rH   rN   zFalconH1Model.forward  s     >>' 		B( -0I I ((33d6OO  (3330AMDK)94>JJ 	 	E!E#+  MM ~~* 	&#]  
 ,,];;rI   NN)rO   rP   rQ   r   rS   r:   r|   r   r   r   rN   rT   rU   s   @rH   r   r     s        AC *4 *4 *4z *43 *4 *4 *4 *4 *4 *4X, ,%, , , , , <@-1 < < 2D8	
 |d* 
       rI   r   c            
           e Zd Zg dddgdZdddZedd	d
eej        ej        f         fd            Z	edd	d
eee
e
f         ee
e
e
f         f         fd            Zed
eeef         fd            Zdddedef fdZdej        d
ej        fdZ	 	 ddej        dej        dedz  dej        dz  fdZdej        d
ej        dz  fdZdeeeej        f                  d
ee         fdZ xZS )FalconH1ForCausalLM)q_projk_projv_proj	gate_projup_proj)r   r=   input_embeddingsoutput_embeddings)r   lm_headr   r   r2   c                 j    t          j        |j        j        |j        j        |j        j                  S rK   )r   mamba2_state_dtyperX   r   rY   mamba_cache_dtypemamba_ssm_cache_dtype)clsr   s     rH   !get_mamba_state_dtype_from_configz5FalconH1ForCausalLM.get_mamba_state_dtype_from_config  s4    
 );$*$6$:
 
 	
rI   c           	          |j         }|j        j        }|j        t	          |j        |j        z            n|j        }t          j        ||j	        |j
        |j        |j        |j        |j                  S )a3  Calculate shapes for Mamba's convolutional and state caches.

        Args:
            vllm_config: vLLM config

        Returns:
            Tuple containing:
            - conv_state_shape: Shape for convolutional state cache
            - temporal_state_shape: Shape for state space model cache
        N)r<   tp_world_sizer_   r`   ra   
state_sizeconv_kernel)parallel_configrX   r   rf   rg   rh   r;   r   mamba2_state_shapetensor_parallel_sizern   ro   rp   rj   rk   )r   r   r   r   r<   s        rH   !get_mamba_state_shape_from_configz5FalconH1ForCausalLM.get_mamba_state_shape_from_config
  s     &5,6	 $, 	&)>>???& 	 );/)>--+ .!.
 
 
 	
rI   c                 (    t          j                    S rK   )r   mamba2_state_copy_func)r   s    rH   get_mamba_state_copy_funcz-FalconH1ForCausalLM.get_mamba_state_copy_func,  s    +BDDDrI   r-   r   r1   c                   |j         j        }|| _        |j         | _         |j        }|j        | _        t                                                       || _        || _        t          |t          |d                    | _
        |j        | _        t                      j        rt          |j        |j        t          |d                    | _        |j        | _        | j        r)| j                            | j
        j                  | _        t+          |j        |j        |j                  | _        nt/                      | _        | j
        j        | _        d S )Nmodel)r   r1   r   r   )scale)rX   r   r   scheduler_configr/   r9   r:   r.   r   r*   r   tie_word_embeddingsr   r   r   r   r;   r   lm_head_multipliertie_weightsr   r   logits_processorr&   r   )rF   r   r1   r.   r  rG   s        rH   r:   zFalconH1ForCausalLM.__init__0  sT   )3&'4&7'4 0"#L,I,I
 
 

 $*#= >>& 	,)!"#FI66  DL
 '-&?D#' Q#|77
8OPP %4!!/% % %D!! *++DL J6 	,,,rI   r   c                 6    | j                             |          S rK   )r   r   r   s     rH   r   z#FalconH1ForCausalLM.embed_input_idsX  s    z)))444rI   Nr   r   r   c                 6    |                      ||||          }|S rK   )r   )rF   r   r   r   r   r   r   s          rH   rN   zFalconH1ForCausalLM.forward[  s+     

 	
 
 rI   r   c                 <    |                      | j        |          }|S rK   )r  r   )rF   r   logitss      rH   compute_logitsz"FalconH1ForCausalLM.compute_logitsl  s      &&t|]CCrI   weightsc                    g d}t          |                                           }t                      }|D ]!\  }}d|v rd|v r|                    dd          }d|v r|                    dd          }|D ]i\  }}}	||vr|                    ||          }|                    d          r||vr;t          ||           rL||         }
|
j        } ||
||	            na|                    d          r||vrt          ||           r| j        rd|v r||         }
t          |
d	t                    } ||
|           |
                    |           #| j        r|
                    d
           |S )N))r   r   r   )r   r   r   )r   r   r   )r=   r   r   )r=   r   r    zrotary_emb.inv_freqA_logArr   zmamba.mambaz.biasr   weight_loaderzlm_head.weight)dictnamed_parameterssetreplaceendswithr'   r  r  r   r   add)rF   r  stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
param_nameweight_nameshard_idparamr  s               rH   load_weightsz FalconH1ForCausalLM.load_weightst  s   "
 "
 "
 4002233"%%%#* %	$ %	$D-$,,$||GS11$||G];;5K 4 41
Kd**||K<<==)) d+.E.E*466 #D) % 3e]H=== ==)) d+.E.E*466 + 	T0A0A#D) '@U V Ve]333d##### 	0.///rI   r   )rO   rP   rQ   packed_modules_mappingembedding_modulesclassmethodtupler|   r   r   rg   r   r   r   r   rS   r:   r   r   r   rN   r  r   r  r   rT   rU   s   @rH   r   r     s>        322$i0  +& 
 
!
 
u{EK'	(
 
 
 [
 
!
 
uS#Xc3m 44	5
 
 
 [
B E%0BDV0V*W E E E [E BD &
 &
 &
z &
3 &
 &
 &
 &
 &
 &
P5 5%, 5 5 5 5 <@-1 < < 2D8	
 |d*   "| 
	   5HU33D-E$F 53s8 5 5 5 5 5 5 5 5rI   r   )Fr   collections.abcr   	itertoolsr   r|   r   transformersr   vllm.attention.layerr   vllm.compilation.decoratorsr   vllm.configr	   r
   r   vllm.distributedr   vllm.distributed.parallel_stater   %vllm.model_executor.layers.activationr   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r   r   +vllm.model_executor.layers.logits_processorr   -vllm.model_executor.layers.mamba.mamba_mixer2r   ,vllm.model_executor.layers.mamba.mamba_utilsr   r   r   r   'vllm.model_executor.layers.quantizationr   +vllm.model_executor.layers.rotary_embeddingr   3vllm.model_executor.layers.vocab_parallel_embeddingr   r   -vllm.model_executor.model_loader.weight_utilsr   vllm.sequencer   vllm.transformers_utils.configr   
interfacesr!   r"   r#   r$   r%   utilsr&   r'   r(   r)   r*   Moduler,   rW   r   r   r   r   re   rI   rH   <module>r<     s   % $ $ $ $ $ $ $              ' ' ' ' ' ' * * * * * * = = = = = = < < < < < < < < < < A A A A A A 8 8 8 8 8 8 < < < < < < 8 8 8 8 8 8         
 H G G G G G E E E E E E            G F F F F F @ @ @ @ @ @        P O O O O O - - - - - - A A A A A A                          ' ' ' ' '") ' ' 'Tl  l  l  l  l bi l  l  l ^d' d' d' d' d'BI d' d' d'N^ ^ ^ ^ ^RY ^ ^ ^B N N N N NBI N N Nb| | | | |I| | | | |rI   