
     `i=?                        d Z ddlmZmZmZ ddlZddlmZ ddlmZ ddl	m
Z
 ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZ ddlmZmZ ddlmZ ddlmZmZmZmZm Z m!Z! ddl"m#Z#  ej$        e%          Z& G d dej'                  Z( G d de           Z)d Z*d$dZ+ G d de          Z, G d de          Z- G d de          Z. G d d e          Z/ G d! d"e          Z0g d#Z1dS )%zPyTorch Cohere model.    )CallableOptionalUnionN)nn   )Cache)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast)dynamic_rope_update)ALL_ATTENTION_FUNCTIONS)Unpack)TransformersKwargslogging)deprecate_kwarg   )LlamaAttentionLlamaForCausalLMLlamaMLP
LlamaModelLlamaRotaryEmbeddingeager_attention_forward   )CohereConfigc                   &     e Zd Zd fd	Zd Z xZS )CohereLayerNormNh㈵>Fc                     t                                                       t          j        t	          j        |                    | _        || _        dS )zcThe hidden size can be a tuple or an int. The tuple is used for QKNorm to normalize across head_dimN)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizeepsbias	__class__s       }/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/cohere/modular_cohere.pyr!   zCohereLayerNorm.__init__6   sB    l5:k#:#:;; #    c                    |j         }|                    t          j                  }|                    dd          }||z
                      d                              dd          }||z
  t          j        || j        z             z  }| j                            t          j                  |z  }|                    |          S )NT)keepdimr   )	dtypetor#   float32meanpowrsqrtr&   r%   )r'   hidden_statesinput_dtyper4   variances        r,   forwardzCohereLayerNorm.forward<   s    #)%((77!!"d!33!D(--a0055b$5GG&-XH]=]1^1^^u}55E,,,r-   )Nr   F)__name__
__module____qualname__r!   r:   __classcell__r+   s   @r,   r   r   5   sL        $ $ $ $ $ $- - - - - - -r-   r   c                   N    e Zd Z ej                    ed                         ZdS )CohereRotaryEmbeddingc                 &   | j         d d d d f                                                             |j        d         dd          }|d d d d d f                                         }t	          |j        j        t                    r|j        j        dk    r|j        j        nd}t          j	        |d          5  |                                |                                z  
                    dd          }t          j        |dd	          }|                                | j        z  }|                                | j        z  }	d d d            n# 1 swxY w Y   |                    |j        
          |	                    |j        
          fS )Nr   r/   r   mpscpuF)device_typeenabledr   dimr1   )inv_freqfloatexpandshape
isinstancedevicetypestrr#   autocast	transposerepeat_interleavecosattention_scalingsinr2   r1   )
r'   xposition_idsinv_freq_expandedposition_ids_expandedrE   freqsembrU   rW   s
             r,   r:   zCohereRotaryEmbedding.forwardG   s    !M$4-8>>@@GGHZ[\H]_acdee ,QQQaaaZ 8 > > @ @'1!(-'E'Ek!(-[`J`J`ahmmfk^UCCC 	5 	5&,,..1F1L1L1N1NNYYZ[]^__E)%;;;C''))d44C''))d44C		5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 vvAGv$$cff17f&;&;;;s   9BEEEN)r;   r<   r=   r#   no_gradr   r:    r-   r,   rA   rA   F   s@        U]__< <  _< < <r-   rA   c                     | dd d df         }| ddd df         }t          j        | |gd                              d          }|S )N.r   r   r/   rG   )r#   stackflatten)rX   x1x2rot_xs       r,   rotate_halfrg   W   sU    	
3!8B	
319BK"b	r***22266ELr-   c                 l   | j         }|                                 } |                                }|                    |          }|                    |          }| |z  t          |           |z  z   }||z  t          |          |z  z   }|                    |          |                    |          fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    rI   )r1   rK   	unsqueezerg   r2   )	qkrU   rW   rY   unsqueeze_dimr1   q_embedk_embeds	            r,   apply_rotary_pos_embro   _   s    ( GE			A			A
--
&
&C
--
&
&C3w;q>>C/0G3w;q>>C/0G::E:""GJJUJ$;$;;;r-   c                        e Zd Z fdZ xZS )	CohereMLPc                 .   t                                          |           t          j        | j        | j        d          | _        t          j        | j        | j        d          | _        t          j        | j        | j        d          | _        d S )NF)r*   )	r    r!   r   Linearr(   intermediate_size	gate_projup_proj	down_projr'   configr+   s     r,   r!   zCohereMLP.__init__~   s|       4#3T5KRWXXXy!143IPUVVV4#94;KRWXXXr-   )r;   r<   r=   r!   r>   r?   s   @r,   rq   rq   }   sA        Y Y Y Y Y Y Y Y Yr-   rq   c                   .    e Zd ZdZddedee         f fdZ eddd	          	 	 dd
e	j
        dee	j
        e	j
        f         dee	j
                 dee         dee	j                 dee         dee	j
        ee	j
                 f         fd            Z xZS )CohereAttentionz=Multi-headed attention from 'Attention Is All You Need' paperNry   	layer_idxc                    t                                          ||           |j        | _        | j        rPt          |j        | j        f|j                  | _        t          |j        | j        f|j                  | _	        d S d S )Nr(   r)   )
r    r!   use_qk_normr   num_attention_headshead_dimlayer_norm_epsq_normnum_key_value_headsk_normr'   ry   r|   r+   s      r,   r!   zCohereAttention.__init__   s    +++!- 	)#7GVMb  DK *#7GVMb  DKKK	 	r-   past_key_valuepast_key_values4.58new_nameversionr7   position_embeddingsattention_maskcache_positionkwargsreturnc                    |j         d d         }g |d| j        R }|                     |                              |          }	|                     |                              |          }
|                     |                              |          }| j        r*|                     |	          }	|                     |
          }
|		                    dd          }	|
	                    dd          }
|	                    dd          }|\  }}t          |	|
||          \  }	}
|&|||d}|                    |
|| j        |          \  }
}t          }| j        j        dk    rt           | j        j                 } || |	|
||f| j        sdn| j        | j        d|\  }} |j        g |dR                                  }|                     |          }||fS )Nr/   r   r   )rW   rU   r   eagerg        )dropoutscaling)rM   r   q_projviewk_projv_projr   r   r   rS   ro   updater|   r   ry   _attn_implementationr   trainingattention_dropoutr   reshape
contiguouso_proj)r'   r7   r   r   r   r   r   input_shapehidden_shapequery_states
key_statesvalue_statesrU   rW   cache_kwargsattention_interfaceattn_outputattn_weightss                     r,   r:   zCohereAttention.forward   s    $)#2#.88b8$-88{{=1166|DD[[//44\BB
{{=1166|DD 	1;;|44LZ00J#--a33))!Q//
#--a33&S#7jRUWZ#[#[ j&#&snUUL'6'='=j,X\Xfht'u'u$J(?;+w66"9$+:Z"[$7$7	%
  $}HCC$2HL	%
 	%
 	%
 	%
!\ *k);;;;;;FFHHkk+..L((r-   N)NN)r;   r<   r=   __doc__r   r   intr!   r   r#   Tensortupler   
LongTensorr   r	   r:   r>   r?   s   @r,   r{   r{      s       GG
 
| 
 
 
 
 
 
 
 _%0A6RRR ,0591) 1)|1) #5<#=>1) !.	1)
 "%1) !!121) -.1) 
u|Xel33	41) 1) 1) SR1) 1) 1) 1) 1)r-   r{   c                   t    e Zd Zdedef fdZ eddd          	 	 	 	 	 	 dd
ej        de	ej                 de	ej
                 de	e         de	e         de	ej
                 de	eej        ej        f                  dee         deej        e	eej        ej        f                  f         fd            Z xZS )CohereDecoderLayerry   r|   c                     t                                                       |j        | _        t          ||          | _        t          |          | _        t          |j        |j                  | _	        d S )N)ry   r|   r~   )
r    r!   r(   r{   	self_attnrq   mlpr   r   input_layernormr   s      r,   r!   zCohereDecoderLayer.__init__   si    !-()LLLV$$.F<NU[Ujkkkr-   r   r   r   r   NFr7   r   rY   	use_cacher   r   r   r   c                     |}	|                      |          } | j        d|||||||d|\  }
}|                     |          }|	|
z   |z   }|S )ar  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            past_key_values (`Cache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
        )r7   r   rY   r   r   r   r   r_   )r   r   r   )r'   r7   r   rY   r   r   r   r   r   residualhidden_states_attention_hidden_states_mlps                r,   r:   zCohereDecoderLayer.forward   s    > !,,];;%3T^ 	&
')%+) 3	&
 	&
 	&
 	&
" !HH]33 #::=NNr-   )NNNFNN)r;   r<   r=   r   r   r!   r   r#   r   r   r   r   boolr   r   r	   FloatTensorr:   r>   r?   s   @r,   r   r      sU       l| l l l l l l l _%0A6RRR 2637+/$)59KO. .|. !.. u/0	.
 "%. D>. !!12. &eEL%,,F&GH. -.. 
u (51BEDU1U+V"WW	X. . . SR. . . . .r-   r   c                   $     e Zd Zdef fdZ xZS )CohereModelry   c                 $   t                                                     t          j        fdt	          j                  D                       | _        t                    | _        t          j
        j                  | _        d S )Nc                 0    g | ]}t          |          S r_   )r   ).0r|   ry   s     r,   
<listcomp>z(CohereModel.__init__.<locals>.<listcomp>  s$    dddy	22dddr-   )ry   r~   )r    r!   r   
ModuleListrangenum_hidden_layerslayersrA   
rotary_embr   r(   r   normrx   s    `r,   r!   zCohereModel.__init__  s       mddddE&JbDcDcddd
 
 0v>>>#1C&J_```			r-   )r;   r<   r=   r   r!   r>   r?   s   @r,   r   r     sO        a| a a a a a a a a a ar-   r   c                   r    e Zd Z fdZ	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deee	e
ej                 f                  deej                 d	eej                 d
ee         dee         dee         deej                 deeej        f         dee         defdZ xZS )CohereForCausalLMc                     t                                          |           t          |          | _        |j        | _        |j        | _        d S r   )r    r!   r   modellogit_scaletie_word_embeddingsrx   s     r,   r!   zCohereForCausalLM.__init__  sF        ((
!-#)#=   r-   Nr   	input_idsr   rY   r   inputs_embedslabelsr   output_attentionsoutput_hidden_statesr   logits_to_keepr   r   c                    ||n| j         j        }|	|	n| j         j        }	 | j        d||||||||	|
d	|}|j        }t          |t                    rt          | d          n|}|                     |dd|ddf                   }|| j	        z  }d}| | j
        d||| j         j        d|}t          |||j        |j        |j                  S )az  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >> from transformers import AutoTokenizer, CohereForCausalLM

        >> model = CohereForCausalLM.from_pretrained("CohereForAI/c4ai-command-r-v01")
        >> tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-v01")

        >> prompt = "Hey, are you conscious? Can you talk to me?"
        >> inputs = tokenizer(prompt, return_tensors="pt")

        >> # Generate
        >> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```N)	r   r   rY   r   r   r   r   r   r   )logitsr   
vocab_size)lossr   r   r7   
attentionsr_   )ry   r   r   r   last_hidden_staterN   r   slicelm_headr   loss_functionr   r   r   r7   r   )r'   r   r   rY   r   r   r   r   r   r   r   r   r   outputsr7   slice_indicesr   r   s                     r,   r:   zCohereForCausalLM.forward  s>   J 2C1N--TXT_Tq$8$D  $+Jj 	
 ,64: ,
)%+'/!5),
 ,
 ,
 ,
  18B>SV8W8Wk~ot444]kmAAA}aaa,?@AA$**%4%pVFt{OeppioppD%#3!/)
 
 
 	
r-   )NNNNNNNNNNr   )r;   r<   r=   r!   r   r#   r   r   r   r   listr   r   r   r   r   r   r:   r>   r?   s   @r,   r   r     sn       > > > > > 151537KO59-1$(,0/35934H
 H
E,-H
 !.H
 u/0	H

 "%tE4E/F(F"GHH
   12H
 )*H
 D>H
 $D>H
 'tnH
 !!12H
 c5</0H
 +,H
 
 H
 H
 H
 H
 H
 H
 H
 H
r-   r   )r   r   CoherePreTrainedModel)Nr   )2r   typingr   r   r   r#   r   cache_utilsr   modeling_flash_attention_utilsr	   modeling_layersr
   modeling_outputsr   r   modeling_rope_utilsr   modeling_utilsr   processing_utilsr   utilsr   r   utils.deprecationr   llama.modeling_llamar   r   r   r   r   r   configuration_coherer   
get_loggerr;   loggerModuler   rA   rg   ro   rq   r{   r   r   r   __all__r_   r-   r,   <module>r      s  .   , , , , , , , , , ,                    B B B B B B 9 9 9 9 9 9 O O O O O O O O 6 6 6 6 6 6 5 5 5 5 5 5 & & & & & & 0 0 0 0 0 0 0 0 0 0 0 0 0 0                / . . . . . 
	H	%	%- - - - -bi - - -"< < < < <0 < < <"  < < < <<Y Y Y Y Y Y Y YA) A) A) A) A)n A) A) A)H7 7 7 7 73 7 7 7ta a a a a* a a aO
 O
 O
 O
 O
( O
 O
 O
d  r-   