
     `i                        d Z ddlmZmZ ddlZddlmZ ddlmZ ddl	m
Z
 ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZmZmZmZmZmZmZ ddlmZ  ej        e           Z! G d de          Z" G d de          Z# G d de          Z$ G d de          Z% G d de          Z& G d de          Z'g dZ(dS )zPyTorch BitNet model.    )CallableOptionalN   )Cache)FlashAttentionKwargs)CausalLMOutputWithPast)ALL_ATTENTION_FUNCTIONS)Unpack)logging)deprecate_kwarg   )GemmaMLP)LlamaAttentionLlamaDecoderLayerLlamaForCausalLM
LlamaModelLlamaRMSNormapply_rotary_pos_embeager_attention_forward   )BitNetConfigc                       e Zd ZdS )BitNetRMSNormN__name__
__module____qualname__     }/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/bitnet/modular_bitnet.pyr   r   +           Dr   r   c                   *     e Zd Zdef fdZd Z xZS )	BitNetMLPconfigc                     t                                          |           t          |j        |j                  | _        d S N)eps)super__init__r   intermediate_sizerms_norm_epsffn_sub_norm)selfr$   	__class__s     r    r)   zBitNetMLP.__init__0   s<       )&*BH[\\\r   c           	          |                      |                     |                     |                     |                    |                     |          z                      }|S )N)	down_projr,   act_fn	gate_projup_proj)r-   xr0   s      r    forwardzBitNetMLP.forward4   sU    NN4#4#4T[[PQARAR5S5SVZVbVbcdVeVe5e#f#fgg	r   )r   r   r   r   r)   r5   __classcell__r.   s   @r    r#   r#   /   sZ        ]| ] ] ] ] ] ]      r   r#   c                       e Zd Zdedef fdZ eddd          	 	 dd	ej        d
e	ej        ej        f         de
ej                 de
e         de
ej                 dee         de	ej        e
ej                 f         fd            Z xZS )BitNetAttentionr$   	layer_idxc                     t                                          ||           t          |j        |j                  | _        d S r&   )r(   r)   r   hidden_sizer+   attn_sub_norm)r-   r$   r:   r.   s      r    r)   zBitNetAttention.__init__:   s>    +++*6+=6CVWWWr   past_key_valuepast_key_valuesz4.58)new_nameversionNhidden_statesposition_embeddingsattention_maskcache_positionkwargsreturnc                 n   |j         d d         }g |d| j        R }|                     |                              |                              dd          }	|                     |                              |                              dd          }
|                     |                              |                              dd          }|\  }}t          |	|
||          \  }	}
|&|||d}|                    |
|| j	        |          \  }
}t          }| j        j        dk    rt          | j        j                 } || |	|
||f| j        sdn| j        | j        d|\  }} |j        g |dR                                  }|                     |          }|                     |          }||fS )Nr   r   )sincosrE   eagerg        )dropoutscaling)shapehead_dimq_projview	transposek_projv_projr   updater:   r   r$   _attn_implementationr	   trainingattention_dropoutrN   reshape
contiguousr=   o_proj)r-   rB   rC   rD   r?   rE   rF   input_shapehidden_shapequery_states
key_statesvalue_statesrK   rJ   cache_kwargsattention_interfaceattn_outputattn_weightss                     r    r5   zBitNetAttention.forward>   s    $)#2#.88b8$-88{{=1166|DDNNqRSTT[[//44\BBLLQPQRR
{{=1166|DDNNqRSTT&S#7jRUWZ#[#[ j&#&snUUL'6'='=j,X\Xfht'u'u$J(?;+w66"9$+:Z"[$7$7	%
  $}HCC$2HL	%
 	%
 	%
 	%
!\ *k);;;;;;FFHH((55kk+..L((r   )NN)r   r   r   r   intr)   r   torchTensortupler   r   
LongTensorr
   r   r5   r6   r7   s   @r    r9   r9   9   s       X| X X X X X X X _%0A6RRR ,059+) +)|+) #5<#=>+) !.	+)
 "%+) !!12+) -.+) 
u|Xel33	4+) +) +) SR+) +) +) +) +)r   r9   c                       e Zd ZdS )BitNetDecoderLayerNr   r   r   r    rl   rl   m   r!   r   rl   c                       e Zd ZdS )BitNetModelNr   r   r   r    rn   rn   q   r!   r   rn   c                   2     e Zd ZdgZdZdZdef fdZ xZS )BitNetForCausalLMzlm_head.weightNrG   c                 6     t                      j        di |S )a$  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, transformers.,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, transformers., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, BitNetForCausalLM

        >>> model = BitNetForCausalLM.from_pretrained("microsoft/bitnet-b1.58-2B-4T")
        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/bitnet-b1.58-2B-4T")

        >>> prompt = f'<|begin_of_text|>User: Hey, are you conscious? Can you talk to me?<|eot_id|>Assistant: '
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=100)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "User: Hey, are you conscious? Can you talk to me?Assistant: No, I'm not conscious. I'm an artificial intelligence designed to assist with information and tasks. How can I help you today?"
        ```r   )r(   r5   )r-   super_kwargsr.   s     r    r5   zBitNetForCausalLM.forwardz   s!    4 uww.....r   )	r   r   r   _tied_weights_keys_tp_plan_pp_planr   r5   r6   r7   s   @r    rp   rp   u   sZ        *+HH/ 
 / / / / / / / / / /r   rp   )rp   rn   BitNetPreTrainedModel))__doc__typingr   r   rg   cache_utilsr   modeling_flash_attention_utilsr   modeling_outputsr   modeling_utilsr	   processing_utilsr
   utilsr   utils.deprecationr   gemma.modeling_gemmar   llama.modeling_llamar   r   r   r   r   r   r   configuration_bitnetr   
get_loggerr   loggerr   r#   r9   rl   rn   rp   __all__r   r   r    <module>r      sU     % % % % % % % %              B B B B B B 6 6 6 6 6 6 5 5 5 5 5 5 & & & & & &       0 0 0 0 0 0 + + + + + +                  / . . . . . 
	H	%	%	 	 	 	 	L 	 	 	       1) 1) 1) 1) 1)n 1) 1) 1)h	 	 	 	 	* 	 	 		 	 	 	 	* 	 	 	/ / / / /( / / /D  r   