
     `i_                        d dl mZmZmZ d dlZd dlmZ ddlmZ ddlm	Z	m
Z
 ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZmZ ddlmZmZ ddlmZ ddlmZm Z m!Z! ddl"m#Z# ddl$m%Z% ddl&m'Z'  G d dej(                  Z) G d dej(                  Z* G d dej(                  Z+dej,        de-dej,        fdZ.	 d5dej(        d ej,        d!ej,        d"ej,        d#eej,                 d$e/d%e/d&ee         fd'Z0d( Z1d6d)Z2 G d* d+ej(                  Z3 G d, d-e          Z4e  G d. d/e                      Z5e  G d0 d1e5                      Z6e  G d2 d3e5e                      Z7g d4Z8dS )7    )CallableOptionalUnionN)nn   )ACT2FN)CacheDynamicCache)GenerationMixin)create_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuple)deprecate_kwarg)check_model_inputs   )CohereConfigc                   &     e Zd Zd fd	Zd Z xZS )CohereLayerNormNh㈵>Fc                     t                                                       t          j        t	          j        |                    | _        || _        dS )zcThe hidden size can be a tuple or an int. The tuple is used for QKNorm to normalize across head_dimN)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizeepsbias	__class__s       ~/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/cohere/modeling_cohere.pyr"   zCohereLayerNorm.__init__4   sB    l5:k#:#:;; #    c                    |j         }|                    t          j                  }|                    dd          }||z
                      d                              dd          }||z
  t          j        || j        z             z  }| j                            t          j                  |z  }|                    |          S )NT)keepdim   )	dtypetor$   float32meanpowrsqrtr'   r&   )r(   hidden_statesinput_dtyper6   variances        r-   forwardzCohereLayerNorm.forward:   s    #)%((77!!"d!33!D(--a0055b$5GG&-XH]=]1^1^^u}55E,,,r.   )Nr   F__name__
__module____qualname__r"   r<   __classcell__r,   s   @r-   r   r   3   sL        $ $ $ $ $ $- - - - - - -r.   r   c                   |     e Zd ZU ej        ed<   ddef fdZ ej                    e	d                         Z
 xZS )CohereRotaryEmbeddinginv_freqNconfigc                    t                                                       t          |d          rSt          |j        t
                    r9|j                            d|j                            d                    | _        nd| _        |j        | _	        |j        | _
        || _        t          | j                 | _        |                     | j        |          \  }| _        |                     d|d           | j        | _        d S )Nrope_scaling	rope_typetypedefaultrE   F)
persistent)r!   r"   hasattr
isinstancerH   dictgetrI   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrF   r   rope_init_fnattention_scalingregister_bufferrE   original_inv_freq)r(   rF   devicerE   r,   s       r-   r"   zCohereRotaryEmbedding.__init__G   s    6>** 	'z&:Mt/T/T 	'#044[&BUBYBYZ`BaBabbDNN&DN"("@$*$B!/?+/+<+<T[&+Q+Q($(ZeDDD!%r.   c                 &   | j         d d d d f                                                             |j        d         dd          }|d d d d d f                                         }t	          |j        j        t                    r|j        j        dk    r|j        j        nd}t          j	        |d          5  |                                |                                z  
                    dd          }t          j        |dd	          }|                                | j        z  }|                                | j        z  }	d d d            n# 1 swxY w Y   |                    |j        
          |	                    |j        
          fS )Nr   r0   r   mpscpuF)device_typeenabledr2   dimr3   )rE   floatexpandshaperN   rX   rJ   strr$   autocast	transposerepeat_interleavecosrU   sinr4   r3   )
r(   xposition_idsinv_freq_expandedposition_ids_expandedr\   freqsembrh   ri   s
             r-   r<   zCohereRotaryEmbedding.forwardX   s    !M$4-8>>@@GGHZ[\H]_acdee ,QQQaaaZ 8 > > @ @'1!(-'E'Ek!(-[`J`J`ahmmfk^UCCC 	5 	5&,,..1F1L1L1N1NNYYZ[]^__E)%;;;C''))d44C''))d44C		5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 vvAGv$$cff17f&;&;;;s   9BEEEN)r>   r?   r@   r$   Tensor__annotations__r   r"   no_gradr   r<   rA   rB   s   @r-   rD   rD   D   s         l/ /| / / / / / /" U]__< <  _< < < < <r.   rD   c                   $     e Zd Z fdZd Z xZS )	CohereMLPc                    t                                                       || _        |j        | _        |j        | _        t          j        | j        | j        d          | _        t          j        | j        | j        d          | _        t          j        | j        | j        d          | _	        t          |j                 | _        d S NFr+   )r!   r"   rF   r)   intermediate_sizer   Linear	gate_projup_proj	down_projr   
hidden_actact_fnr(   rF   r,   s     r-   r"   zCohereMLP.__init__i   s    !-!'!94#3T5KRWXXXy!143IPUVVV4#94;KRWXXXV./r.   c                     |                      |                     |                     |                    |                     |          z            }|S rp   )r}   r   r{   r|   )r(   rj   r}   s      r-   r<   zCohereMLP.forwards   sA    NN4;;t~~a/@/@#A#ADLLQROO#STT	r.   r=   rB   s   @r-   ru   ru   h   sG        0 0 0 0 0      r.   ru   r9   n_repreturnc                     | j         \  }}}}|dk    r| S | dddddddddf                             |||||          } |                     |||z  ||          S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)rc   rb   reshape)r9   r   batchnum_key_value_headsslenhead_dims         r-   	repeat_kvr   x   s    
 2?1D.Ehzz!!!!QQQaaa"23::5BUW\^bdlmmM  (;e(CT8TTTr.           modulequerykeyvalueattention_maskscalingdropoutkwargsc                 R   t          || j                  }t          || j                  }	t          j        ||                    dd                    |z  }
|$|d d d d d d d |j        d         f         }|
|z   }
t          j                            |
dt          j	                  
                    |j                  }
t          j                            |
|| j                  }
t          j        |
|	          }|                    dd                                          }||
fS )Nr2   r   r0   )r_   r3   )ptrainingr   )r   num_key_value_groupsr$   matmulrf   rc   r   
functionalsoftmaxr5   r4   r3   r   r   
contiguous)r   r   r   r   r   r   r   r   
key_statesvalue_statesattn_weightscausal_maskattn_outputs                r-   eager_attention_forwardr      s    3 ;<<JUF$?@@L<z';';Aq'A'ABBWLL!$QQQ111.D
0@0D.D%DE#k1=((2U](SSVVW\WbccL=((6?([[L,|\::K''1--88::K$$r.   c                     | dd d df         }| ddd df         }t          j        | |gd                              d          }|S )N.r2   r   r0   r^   r   )r$   stackflatten)rj   x1x2rot_xs       r-   rotate_halfr      sU    	
3!8B	
319BK"b	r***22266ELr.   c                 l   | j         }|                                 } |                                }|                    |          }|                    |          }| |z  t          |           |z  z   }||z  t          |          |z  z   }|                    |          |                    |          fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    r`   )r3   ra   	unsqueezer   r4   )	qkrh   ri   rk   unsqueeze_dimr3   q_embedk_embeds	            r-   apply_rotary_pos_embr      s    ( GE			A			A
--
&
&C
--
&
&C3w;q>>C/0G3w;q>>C/0G::E:""GJJUJ$;$;;;r.   c                   .    e Zd ZdZddedee         f fdZ eddd	          	 	 dd
e	j
        dee	j
        e	j
        f         dee	j
                 dee         dee	j                 dee         dee	j
        ee	j
                 f         fd            Z xZS )CohereAttentionz=Multi-headed attention from 'Attention Is All You Need' paperNrF   	layer_idxc                 t   t                                                       || _        || _        t	          |d|j        |j        z            | _        |j        |j        z  | _	        | j        dz  | _
        |j        | _        d| _        t          j        |j        |j        | j        z  |j                  | _        t          j        |j        |j        | j        z  |j                  | _        t          j        |j        |j        | j        z  |j                  | _        t          j        |j        | j        z  |j        |j                  | _        |j        | _        | j        rPt+          |j        | j        f|j                  | _        t+          |j        | j        f|j                  | _        d S d S )Nr   g      Trx   r)   r*   )r!   r"   rF   r   getattrr)   num_attention_headsr   r   r   r   attention_dropout	is_causalr   rz   attention_biasq_projk_projv_projo_projuse_qk_normr   layer_norm_epsq_normk_normr(   rF   r   r,   s      r-   r"   zCohereAttention.__init__   s   "
F4F&Jd4dee$*$>&B\$\!}d*!'!9i :T] JQWQf
 
 
 i :T] JQWQf
 
 
 i :T] JQWQf
 
 
 i&68JQWQf
 
 
 "- 	)#7GVMb  DK *#7GVMb  DKKK	 	r.   past_key_valuepast_key_values4.58new_nameversionr9   position_embeddingsr   cache_positionr   r   c                    |j         d d         }g |d| j        R }|                     |                              |          }	|                     |                              |          }
|                     |                              |          }| j        r*|                     |	          }	|                     |
          }
|		                    dd          }	|
	                    dd          }
|	                    dd          }|\  }}t          |	|
||          \  }	}
|&|||d}|                    |
|| j        |          \  }
}t          }| j        j        dk    rt           | j        j                 } || |	|
||f| j        sdn| j        | j        d|\  }} |j        g |dR                                  }|                     |          }||fS )Nr0   r   r2   )ri   rh   r   eagerr   )r   r   )rc   r   r   viewr   r   r   r   r   rf   r   updater   r   rF   _attn_implementationr   r   r   r   r   r   r   )r(   r9   r   r   r   r   r   input_shapehidden_shapequery_statesr   r   rh   ri   cache_kwargsattention_interfacer   r   s                     r-   r<   zCohereAttention.forward   s    $)#2#.88b8$-88{{=1166|DD[[//44\BB
{{=1166|DD 	1;;|44LZ00J#--a33))!Q//
#--a33&S#7jRUWZ#[#[ j&#&snUUL'6'='=j,X\Xfht'u'u$J(?;+w66"9$+:Z"[$7$7	%
  $}HCC$2HL	%
 	%
 	%
 	%
!\ *k);;;;;;FFHHkk+..L((r.   rp   )NN)r>   r?   r@   __doc__r   r   intr"   r   r$   rq   tupler	   
LongTensorr   r   r<   rA   rB   s   @r-   r   r      s       GG |       @ _%0A6RRR ,0591) 1)|1) #5<#=>1) !.	1)
 "%1) !!121) -.1) 
u|Xel33	41) 1) 1) SR1) 1) 1) 1) 1)r.   r   c                   t    e Zd Zdedef fdZ eddd          	 	 	 	 	 	 dd
ej        de	ej                 de	ej
                 de	e         de	e         de	ej
                 de	eej        ej        f                  dee         deej        e	eej        ej        f                  f         fd            Z xZS )CohereDecoderLayerrF   r   c                     t                                                       |j        | _        t          ||          | _        t          |          | _        t          |j        |j                  | _	        d S )N)rF   r   r   )
r!   r"   r)   r   	self_attnru   mlpr   r   input_layernormr   s      r-   r"   zCohereDecoderLayer.__init__  si    !-()LLLV$$.F<NU[Ujkkkr.   r   r   r   r   NFr9   r   rk   	use_cacher   r   r   r   c                     |}	|                      |          } | j        d|||||||d|\  }
}|                     |          }|	|
z   |z   }|S )ar  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            past_key_values (`Cache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
        )r9   r   rk   r   r   r   r    )r   r   r   )r(   r9   r   rk   r   r   r   r   r   residualhidden_states_attention_hidden_states_mlps                r-   r<   zCohereDecoderLayer.forward$  s    > !,,];;%3T^ 	&
')%+) 3	&
 	&
 	&
 	&
" !HH]33 #::=NNr.   )NNNFNN)r>   r?   r@   r   r   r"   r   r$   rq   r   r   r	   boolr   r   r   FloatTensorr<   rA   rB   s   @r-   r   r     sU       l| l l l l l l l _%0A6RRR 2637+/$)59KO. .|. !.. u/0	.
 "%. D>. !!12. &eEL%,,F&GH. -.. 
u (51BEDU1U+V"WW	X. . . SR. . . . .r.   r   c                   L    e Zd ZU eed<   dZdZdgZdgZdZ	dZ
dZdZdZeedZdS )CoherePreTrainedModelrF   modelTr   r   )r9   
attentionsN)r>   r?   r@   r   rr   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendr   r   _can_record_outputsr   r.   r-   r   r   V  sl         &*#-.#4"5N!"&+% r.   r   c                       e Zd Zdef fdZee	 	 	 	 	 	 	 ddeej	                 deej
                 deej	                 dee         deej                 d	eej	                 d
ee         dee         defd                        Z xZS )CohereModelrF   c                    t                                                     j        | _        j        | _        t          j        j        j        | j                  | _        t          j	        fdt          j                  D                       | _        t          j        j                  | _        t!                    | _        d| _        |                                  d S )Nc                 0    g | ]}t          |          S r   )r   ).0r   rF   s     r-   
<listcomp>z(CohereModel.__init__.<locals>.<listcomp>r  s$    dddy	22dddr.   r   rF   F)r!   r"   pad_token_idpadding_idx
vocab_sizer   	Embeddingr)   embed_tokens
ModuleListrangenum_hidden_layerslayersr   r   normrD   
rotary_embgradient_checkpointing	post_initr   s    `r-   r"   zCohereModel.__init__k  s       !. +L):F<NPTP`aamddddE&JbDcDcddd
 
 $1C&J_```	/v>>>&+# 	r.   N	input_idsr   rk   r   inputs_embedsr   r   r   r   c           
      N   |d u |d uz  rt          d          ||                     |          }|r|t          | j                  }|B||                                nd}	t          j        |	|	|j        d         z   |j                  }||	                    d          }t          | j        |||||          }
|}|                     ||          }| j        d | j        j                 D ]} ||f|
||||d|}|                     |          }t          ||          S )	Nz:You must specify exactly one of input_ids or inputs_embedsr   r   r   )rX   )rF   input_embedsr   r   r   rk   )r   rk   r   r   r   )last_hidden_stater   )
ValueErrorr  r
   rF   get_seq_lengthr$   arangerc   rX   r   r   r	  r  r  r  r   )r(   r  r   rk   r   r  r   r   r   past_seen_tokensr   r9   r   decoder_layers                 r-   r<   zCohereModel.forward{  s    -t";< 	[YZZZ *.*;*;I*F*FM 	?0*$+>>>O!CRC^==???de+0< "2]5H5K"KTaTh, , ,N )33A66L(;&))+%
 
 
 &"oom\JJ![)H4;+H)HI 		 		M)M*) /-$7   MM 		-00&++
 
 
 	
r.   )NNNNNNN)r>   r?   r@   r   r"   r   r   r   r$   r   rq   r	   r   r   r   r   r   r<   rA   rB   s   @r-   r   r   i  s       |         151537+/5959$(8
 8
E,-8
 !.8
 u/0	8

 "%8
   128
 !!128
 D>8
 +,8
 
!8
 8
 8
 ^ 8
 8
 8
 8
 8
r.   r   c                       e Zd ZdgZddiZddgdgfiZ fdZee	 	 	 	 	 	 	 	 	 	 	 dd	e	e
j                 d
e	e
j                 de	e
j                 de	eeee
j                 f                  de	e
j                 de	e
j                 de	e         de	e         de	e         de	e
j                 deee
j        f         dee         defd                        Z xZS )CohereForCausalLMzlm_head.weightlm_headcolwise_repr9   logitsc                 .   t                                          |           t          |          | _        |j        | _        t          j        |j        |j        d          | _        |j	        | _	        |j
        | _
        |                                  d S rw   )r!   r"   r   r   r  r   rz   r)   r  logit_scaletie_word_embeddingsr  r   s     r-   r"   zCohereForCausalLM.__init__  s        ((
 +y!3V5FUSSS!-#)#=  	r.   Nr   r  r   rk   r   r  labelsr   output_attentionsoutput_hidden_statesr   logits_to_keepr   r   c                    ||n| j         j        }|	|	n| j         j        }	 | j        d||||||||	|
d	|}|j        }t          |t                    rt          | d          n|}|                     |dd|ddf                   }|| j	        z  }d}| | j
        d||| j         j        d|}t          |||j        |j        |j                  S )az  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >> from transformers import AutoTokenizer, CohereForCausalLM

        >> model = CohereForCausalLM.from_pretrained("CohereForAI/c4ai-command-r-v01")
        >> tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-v01")

        >> prompt = "Hey, are you conscious? Can you talk to me?"
        >> inputs = tokenizer(prompt, return_tensors="pt")

        >> # Generate
        >> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```N)	r  r   rk   r   r  r   r  r   r   )r  r  r  )lossr  r   r9   r   r   )rF   r  r   r   r  rN   r   slicer  r  loss_functionr  r   r   r9   r   )r(   r  r   rk   r   r  r  r   r  r   r   r!  r   outputsr9   slice_indicesr  r#  s                     r-   r<   zCohereForCausalLM.forward  s>   N 2C1N--TXT_Tq$8$D  $+Jj 	
 ,64: ,
)%+'/!5),
 ,
 ,
 ,
  18B>SV8W8Wk~ot444]kmAAA}aaa,?@AA$**%4%pVFt{OeppioppD%#3!/)
 
 
 	
r.   )NNNNNNNNNNr   )r>   r?   r@   _tied_weights_keys_tp_plan_pp_planr"   r   r   r   r$   r   rq   r   r	   listr   r   r   r   r   r   r<   rA   rB   s   @r-   r  r    s       *+=)H_-z:;H	 	 	 	 	  151537KO59-1$(,0/35934H
 H
E,-H
 !.H
 u/0	H

 "%tE4E/F(F"GHH
   12H
 )*H
 D>H
 $D>H
 'tnH
 !!12H
 c5</0H
 +,H
 
 H
 H
 H
 ^ H
 H
 H
 H
 H
r.   r  )r  r   r   )r   )Nr   )9typingr   r   r   r$   r   activationsr   cache_utilsr	   r
   
generationr   masking_utilsr   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.deprecationr   utils.genericr   configuration_coherer   Moduler   rD   ru   rq   r   r   ra   r   r   r   r   r   r   r   r  __all__r   r.   r-   <module>r=     s  < - , , , , , , , , ,        ! ! ! ! ! ! . . . . . . . . ) ) ) ) ) ) / / / / / / B B B B B B 9 9 9 9 9 9 O O O O O O O O K K K K K K K K F F F F F F F F & & & & & & I I I I I I I I I I 0 0 0 0 0 0 / / / / / / . . . . . .- - - - -bi - - -"!< !< !< !< !<BI !< !< !<H    	    	UU\ 	U# 	U%, 	U 	U 	U 	U& % %I%<% 
% <	%
 U\*% % % '(% % % %4  < < < <<U) U) U) U) U)bi U) U) U)p7 7 7 7 73 7 7 7t     O   $ K
 K
 K
 K
 K
' K
 K
 K
\ Z
 Z
 Z
 Z
 Z
- Z
 Z
 Z
z H
G
Gr.   