
     `i7                        d Z ddlmZmZ ddlZddlmZ ddlmZ ddl	m
Z
 ddlmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZ ddlmZ ddlmZmZmZmZmZmZmZm Z m!Z!m"Z" ddl#m$Z$  ej%        e&          Z'dZ( G d de           Z) G d de          Z* G d de          Z+ G d de          Z, G d de          Z- G d de          Z. G d d e          Z/ G d! d"e          Z0 G d# d$e          Z1 G d% d&e          Z2g d'Z3dS )(zPyTorch Qwen3 model.    )CallableOptionalN   )Cache)FlashAttentionKwargs)CausalLMOutputWithPast)ALL_ATTENTION_FUNCTIONS)Unpack)TransformersKwargslogging)deprecate_kwarg   )GemmaMLP)LlamaAttention)
Qwen2DecoderLayerQwen2ForCausalLMQwen2ForQuestionAnsweringQwen2ForSequenceClassificationQwen2ForTokenClassification
Qwen2ModelQwen2PreTrainedModelQwen2RMSNormapply_rotary_pos_embeager_attention_forward   )Qwen3ConfigzQwen/Qwen3-8Bc                       e Zd ZdS )Qwen3RMSNormN__name__
__module____qualname__     {/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/qwen3/modular_qwen3.pyr   r   4           Dr$   r   c                       e Zd ZdS )Qwen3MLPNr   r#   r$   r%   r(   r(   8   r&   r$   r(   c                       e Zd Zdedef fdZ eddd          	 	 dd	ej        d
e	ej        ej        f         de
ej                 de
e         de
ej                 dee         de	ej        e
ej                 f         fd            Z xZS )Qwen3Attentionconfig	layer_idxc                 
   t                                          ||           t          | j        |j                  | _        t          | j        |j                  | _        |j        |         dk    r|j        nd | _        d S )N)epssliding_attention)	super__init__r   head_dimrms_norm_epsq_normk_normlayer_typessliding_window)selfr+   r,   	__class__s      r%   r1   zQwen3Attention.__init__=   sy    +++"4=f6IJJJ"4=f6IJJJ7=7I)7TXk7k7kf33qur$   past_key_valuepast_key_valuesz4.58)new_nameversionNhidden_statesposition_embeddingsattention_maskcache_positionkwargsreturnc                    |j         d d         }g |d| j        R }|                     |                     |                              |                                        dd          }	|                     |                     |                              |                                        dd          }
|                     |                              |                              dd          }|\  }}t          |	|
||          \  }	}
|&|||d}|
                    |
|| j        |          \  }
}t          }| j        j        dk    rt          | j        j                 } || |	|
||f| j        sdn| j        | j        | j        d|\  }} |j        g |dR                                  }|                     |          }||fS )Nr   r   )sincosrA   eagerg        )dropoutscalingr7   )shaper2   r4   q_projview	transposer5   k_projv_projr   updater,   r   r+   _attn_implementationr	   trainingattention_dropoutrJ   r7   reshape
contiguouso_proj)r8   r>   r?   r@   r;   rA   rB   input_shapehidden_shapequery_states
key_statesvalue_statesrG   rF   cache_kwargsattention_interfaceattn_outputattn_weightss                     r%   forwardzQwen3Attention.forwardC   s    $)#2#.88b8$-88{{4;;}#=#=#B#B<#P#PQQ[[\]_`aa[[]!;!;!@!@!N!NOOYYZ[]^__
{{=1166|DDNNqRSTT&S#7jRUWZ#[#[ j&#&snUUL'6'='=j,X\Xfht'u'u$J(?;+w66"9$+:Z"[$7$7
%
  $}HCC$2HL.
%
 
%
 
%
 
%
!\ *k);;;;;;FFHHkk+..L((r$   )NN)r    r!   r"   r   intr1   r   torchTensortupler   r   
LongTensorr
   r   ra   __classcell__r9   s   @r%   r*   r*   <   s       v{ vs v v v v v v _%0A6RRR ,059*) *)|*) #5<#=>*) !.	*)
 "%*) !!12*) -.*) 
u|Xel33	4*) *) *) SR*) *) *) *) *)r$   r*   c                       e Zd ZdS )Qwen3DecoderLayerNr   r#   r$   r%   rj   rj   q   r&   r$   rj   c                       e Zd ZdS )Qwen3PreTrainedModelNr   r#   r$   r%   rl   rl   u   r&   r$   rl   c                       e Zd ZdS )
Qwen3ModelNr   r#   r$   r%   rn   rn   y   r&   r$   rn   c                   4     e Zd Zdee         def fdZ xZS )Qwen3ForCausalLMsuper_kwargsrC   c                 6     t                      j        di |S )a^  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, Qwen3ForCausalLM

        >>> model = Qwen3ForCausalLM.from_pretrained("Qwen/Qwen3-8B")
        >>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-8B")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```r#   )r0   ra   )r8   rq   r9   s     r%   ra   zQwen3ForCausalLM.forward~   s!    4 uww.....r$   )r    r!   r"   r
   r   r   ra   rg   rh   s   @r%   rp   rp   }   sU        /12/ 
 / / / / / / / / / /r$   rp   c                       e Zd ZdS )Qwen3ForSequenceClassificationNr   r#   r$   r%   rt   rt      r&   r$   rt   c                       e Zd ZdS )Qwen3ForTokenClassificationNr   r#   r$   r%   rv   rv      r&   r$   rv   c                       e Zd ZdS )Qwen3ForQuestionAnsweringNr   r#   r$   r%   rx   rx      r&   r$   rx   )rp   rx   rl   rn   rt   rv   )4__doc__typingr   r   rc   cache_utilsr   modeling_flash_attention_utilsr   modeling_outputsr   modeling_utilsr	   processing_utilsr
   utilsr   r   utils.deprecationr   gemma.modeling_gemmar   llama.modeling_llamar   qwen2.modeling_qwen2r   r   r   r   r   r   r   r   r   r   configuration_qwen3r   
get_loggerr    logger_CHECKPOINT_FOR_DOCr   r(   r*   rj   rl   rn   rp   rt   rv   rx   __all__r#   r$   r%   <module>r      sH     % % % % % % % %              B B B B B B 6 6 6 6 6 6 5 5 5 5 5 5 & & & & & & 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + + + + + +                             - , , , , , 
	H	%	%% 	 	 	 	 	< 	 	 		 	 	 	 	x 	 	 	2) 2) 2) 2) 2)^ 2) 2) 2)j	 	 	 	 	) 	 	 		 	 	 	 	/ 	 	 		 	 	 	 	 	 	 	/ / / / /' / / /<	 	 	 	 	%C 	 	 		 	 	 	 	"= 	 	 		 	 	 	 	 9 	 	 	  r$   