
    .`i                        d Z ddlZddlmZ ddlmZ ddlmZ ddlmZ ddl	m
Z
 ddlmZ dd	lmZ dd
lmZmZ ddlmZ  ee          Z G d de          Z G d dej                  Z G d dej                  Z G d de          ZdS )zBInference-only FlexOlmo model compatible with HuggingFace weights.    N)nn)
VllmConfig)$get_tensor_model_parallel_world_size)init_logger)FusedMoE)RMSNorm)ReplicatedLinear)OlmoeAttentionOlmoeForCausalLM)FlexOlmoConfigc                   .     e Zd Zdddedef fdZ xZS )FlexOlmoAttention prefixvllm_configr   c                4   t                                          ||           |j        j        }t	          |t
                    sJ t          | j        | j        z  |j	                  | _
        t          | j        | j        z  |j	                  | _        d S )Nr   r   eps)super__init__model_config	hf_config
isinstancer   r   total_num_kv_headshead_dimrms_norm_epsk_normtotal_num_headsq_normselfr   r   r   	__class__s       x/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/flex_olmo.pyr   zFlexOlmoAttention.__init__!   s    [@@@,6	)^44444#dm39O
 
 
  4=0i6L
 
 
    )__name__
__module____qualname__r   strr   __classcell__r$   s   @r%   r   r       sX        AC 
 
 
z 
3 
 
 
 
 
 
 
 
 
 
r&   r   c                   V     e Zd ZdZdddedef fdZdej        dej        fd	Z	 xZ
S )
FlexOlmoMoEa	  A tensor-parallel MoE implementation for FlexOlmo that shards each expert
    across all ranks.

    Each expert's weights are sharded across all ranks and a fused MoE
    kernel is used for the forward pass, and finally we reduce the outputs
    across ranks.
    r   r   r   r   c                x   t                                                       |j        j        }t	          |t
                    sJ t                      }t          |j        |j	        ddd | d          | _
        t          |j	        |j        |j        |j        ddd || d	  	        | _        |j        | _        d S )NFz.gate)biasreturn_biasquant_configr   Tz.experts)	num_expertstop_khidden_sizeintermediate_sizereduce_resultsrenormalizer2   tp_sizer   )r   r   r   r   r   r   r   r	   r5   r3   gater   num_experts_per_tokr6   expertsr4   )r#   r   r   r   r9   r$   s        r%   r   zFlexOlmoMoE.__init__8   s    ,6	)^44444688 %!!###
 
 
	  !-/!-'9&&&

 

 

 2


r&   hidden_statesreturnc                 D   |j         }|j         d         }|                    d|          }|                     |          }|                     |                                                                |                                          }|                    |          S )N)r=   router_logits)shapeviewr:   r<   detachclonefloat)r#   r=   
orig_shape
hidden_dimrA   final_hidden_statess         r%   forwardzFlexOlmoMoE.forwardY   s    "(
"(,
%**2z:: 		-00 #ll'..006688'--// + 
 

 #''
333r&   )r'   r(   r)   __doc__r   r*   r   torchTensorrJ   r+   r,   s   @r%   r.   r.   /   s          BD 3 3 3z 33 3 3 3 3 3 3B4U\ 4el 4 4 4 4 4 4 4 4r&   r.   c                        e Zd Zdddededdf fdZdej        d	ej        d
ej        dz  deej        ej        dz  f         fdZ	 xZ
S )FlexOlmoDecoderLayerr   r   r   r   r>   Nc                r   t                                                       |j        j        }t	          |t
                    sJ t          || d          | _        t          |j	        |j
                  | _        t          |j	        |j
                  | _        t          || d          | _        d S )Nz
.self_attnr   r   z.mlp)r   r   r   r   r   r   r   	self_attnr   r5   r   post_attention_layernormpost_feedforward_layernormr.   mlpr"   s       r%   r   zFlexOlmoDecoderLayer.__init__l   s    ,6	)^44444*#v,A,A,A
 
 
 )0!y'=)
 )
 )
% +2!y'=+
 +
 +
' ;&OOOr&   	positionsr=   residualc                     |}|                      ||          }|                     |          }||z   }|}|                     |          }|                     |          }||z   }|d fS )N)rQ   rR   rT   rS   )r#   rU   r=   rV   s       r%   rJ   zFlexOlmoDecoderLayer.forward}   sy     !y-@@55mDD%0 !//77FF =0d""r&   )r'   r(   r)   r   r*   r   rL   rM   tuplerJ   r+   r,   s   @r%   rO   rO   k   s        AC P P Pz P3 P P P P P P P"#<# |# ,%	#
 
u|U\D00	1# # # # # # # #r&   rO   c                   N     e Zd ZdZdeddededeej	                 f fdZ
 xZS )FlexOlmoForCausalLMFr   )r   
layer_typer   r   r[   c                P    t                                          |||           d S )N)r   r   r[   )r   r   )r#   r   r   r[   r$   s       r%   r   zFlexOlmoForCausalLM.__init__   s*     	[JWWWWWr&   )r'   r(   r)   fall_back_to_pt_during_loadrO   r   r*   typer   Moduler   r+   r,   s   @r%   rZ   rZ      s        "' &:X X X  X 	X
 OX X X X X X X X X Xr&   rZ   )rK   rL   r   vllm.configr   vllm.distributedr   vllm.loggerr   $vllm.model_executor.layers.fused_moer   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr	    vllm.model_executor.models.olmoer
   r   vllm.transformers_utils.configsr   r'   loggerr   r_   r.   rO   rZ    r&   r%   <module>rj      s   I H        " " " " " " A A A A A A # # # # # # 9 9 9 9 9 9 8 8 8 8 8 8 > > > > > > M M M M M M M M : : : : : :	X		
 
 
 
 
 
 
 
94 94 94 94 94") 94 94 94x## ## ## ## ##29 ## ## ##L
X 
X 
X 
X 
X* 
X 
X 
X 
X 
Xr&   