
    ePi                        d Z ddlmZmZmZ ddlZddlmZm	Z	m
Z
  G d de
j                  Z G d de
j                  Z G d	 d
e
j                  Z G d de
j                  Z G d de
j                  Z G d de
j                  Z G d de
j                  Z G d de
j                  Z G d de
j                  Z G d de
j                  Z G d de
j                  Z G d de
j                  ZdS )z0Declares specification of the Transformer model.    )OptionalTupleUnionN)attention_speccommon_spec
model_specc            /       ,   e Zd Zddej        j        dej        j        ddddddddddddddddfdedede	d	e	d
ej        dedej        de	de	de	de	de	de	de
e         de
e         de
e         de	de
ej                 dedede
e         de
e	         de	f.dZdS )TransformerEncoderSpecTF   N'  
num_layers	num_headspre_normno_final_norm
activationnum_source_embeddingsembeddings_mergelayernorm_embeddingrelative_positionrelative_attention_biasffn_glurms_normmulti_query_attentionnum_heads_kvhead_dim
rotary_dimrotary_interleaverotary_scaling_typerotary_scaling_factorrotary_basesliding_windowqk_normpre_post_layer_normc                   	
 |rdk    rt          d          d|| _        t          j        d                              |          | _        || _        t          j        d                              |          | _        t          j        d                              |          | _        d t          |          D             | _
        d| _        	s
st                      | _        |r|st          j                  | _        |rt          j                  | _        |,t          j        d	                              |          | _        
	fd
t          |          D             | _        dS )a'  Initializes a Transformer encoder specification.

        Args:
          num_layers: Number of layers.
          num_heads: Number of attention heads.
          pre_norm: Enable the pre-norm Transformer architecture.
          no_final_norm: Disable the final layer norm in the pre-norm architecture.
          activation: Activation to apply in the feed-forward network.
          num_source_embeddings: Number of source embeddings.
          embeddings_merge: When :obj:`num_source_embeddings` > 1, specify how the
            embeddings are merged.
          layernorm_embedding: Apply layer normalization after the embedding layer.
          relative_position: Use relative position representations in the self-attention
            layers as described in https://arxiv.org/abs/1803.02155.
          relative_attention_bias: Use relative attention bias in the self-attention
            layers as described in the T5 paper https://arxiv.org/abs/1910.10683.
          ffn_glu: Use gated linear units in the FFN layers as described in
            https://arxiv.org/abs/2002.05202.
          rms_norm: Use the root mean square layer normalization.
          multi_query_attention: Use multi-query attention (alias for num_heads_kv=1).
          num_heads_kv: Number of attention heads for the key and value.
          head_dim: Number of dimensions per attention head.
          rotary_dim: Apply rotary embeddings to these first N dimensions. If 0, rotary
            embeddings are applied to all dimensions.
          rotary_interleave: Interleave the head dimensions when rotary embeddings are applied.
            Otherwise the head dimensions are sliced in half.
          rotary_scaling_type: Type of RoPE scaling.
          rotary_scaling_factor: Factor used in the RoPE scaling.
          rotary_base: The base period of the rotary embeddings.
          sliding_window: Max sequence length to retain in KV Cache.
          qk_norm: Apply layer normalization to the query and key projections.
          pre_post_layer_norm: Add post layer norm for each pre norm layer.
        Nr   5Enabling multi_query_attention implies num_heads_kv=1int16int8c                 4    g | ]}t          j                    S  )r   EmbeddingsSpec).0_s     v/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/ctranslate2/specs/transformer_spec.py
<listcomp>z3TransformerEncoderSpec.__init__.<locals>.<listcomp>S   s.     
 
 
-.K&((
 
 
    Tr   int32c                 H    g | ]}t          	
           S ))r   r   r   r   r   r   r   r   r   r   r    r"   r#   )TransformerEncoderLayerSpec)r+   r,   r   r   r   r#   r"   r   r   r   r    r   r   r   r   s     r-   r.   z3TransformerEncoderSpec.__init__.<locals>.<listcomp>`   sa     
 
 
   ("3(?!)!%"3$7&;'$7  
 
 
r/   )
ValueErrorr   npdtypetyper   r   r   r   range
embeddingsscale_embeddingsPositionEncoderSpecposition_encodingsr   LayerNormSpec
layer_normr   r!   layer)selfr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   s            ```` ``````` ``r-   __init__zTransformerEncoderSpec.__init__   s   x ! 	'LA,=,= K   L%:"'**//	:: (6**//
;; " 0 0 5 56F G G
 
278M2N2N
 
 
 !%  	<)@ 	<&9&;&;D# 	KM 	K)7JJJDO 	T'2'@('S'S'SD$%"$(7"3"3"8"8"H"HD
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
  :&&!
 
 



r/   )__name__
__module____qualname__r   
ActivationRELUEmbeddingsMergeCONCATintboolr   r   RotaryScalingTypefloatrA   r)   r/   r-   r
   r
   
   s       
 #-8-C-H%&8C8S8Z$)"'(-&+&*"&$("&JN'("(,"'$)1f
 f
f
 f
 	f

 f
  *f
  #f
 &5f
 "f
  f
 "&f
 f
 f
  $f
 smf
  3-!f
" SM#f
$  %f
& &n&FG'f
(  %)f
* +f
, !-f
. $/f
0 "1f
 f
 f
 f
 f
 f
r/   r
   c            J          e Zd Zdej        j        ddddddddddddddddddddddddddddddddf"ded	ed
edej        dedededededededededededededee         dedee	j
                 dedededed ed!ed"ed#ed$ee         d%ee         d&ee         d'eej                 d(ee         d)ee         d*ed+ee         fHd,Zed-             ZdS ).TransformerDecoderSpecTFr   Nr   r   r   r   r   r   r   with_encoder_attentionr   project_in_outr   r   alignment_layeralignment_headsr   r   alibialibi_use_positive_positionsscale_alibir   r   r   r   r     original_max_position_embeddingsmax_position_embeddingsparallel_residualshared_layer_normr#   r   r   r   r!   
quant_typequant_group_size
quant_bitsr"    external_pre_post_encoder_layersc%                 <  	
#$ t                      | _        r"|st          d          rt          d          |rdk    rt          d          dt          j        d                              |          | _        || _        t          j        d                              |          | _        t          j        d                              |          | _	        t          j        d                              |          | _
        t          j                    | _        d| _        t          j        | _        || _        || _        || _        ,t          j        d	                                        | _        	s
s|st-                      | _        |r|st          j        
          | _        |rt          j        
          | _        t          j                    | _        $#
	fdt;          |          D             | _        d| _        |p|k    | j        d<   |r0t          j                    | _         t          j                    | _!        | r | | j        d<   |"| j        d<   |!| j        d<   dS dS )a.  Initializes a Transformer decoder specification.

        Args:
          num_layers: Number of layers.
          num_heads: Number of attention heads.
          pre_norm: Enable the pre-norm Transformer architecture.
          activation: Activation to apply in the feed-forward network.
          layernorm_embedding: Apply layer normalization after the embedding layer.
          with_encoder_attention: Enable the encoder attention sublayers.
          no_final_norm: Disable the final layer norm in the pre-norm architecture.
          project_in_out: Add linear transformations after the embedding layer and before
            the final layer.
          relative_position: Use relative position representations in the self-attention
            layers as described in https://arxiv.org/abs/1803.02155.
          relative_attention_bias: Use relative attention bias in the self-attention
            layers as described in the T5 paper https://arxiv.org/abs/1910.10683.
          alignment_layer: Layer index selected for alignment.
          alignment_heads: Number of attention heads selected for alignment.
          ffn_glu: Use gated linear units in the FFN layers as described in
            https://arxiv.org/abs/2002.05202.
          rms_norm: Use the root mean square layer normalization.
          alibi: Use attention with linear biases.
          alibi_use_positive_positions: Use positive positions in the ALiBi definition.
          scale_alibi: Apply the dot product scale factor to ALiBi.
          rotary_dim: Apply rotary embeddings to these first N dimensions. If 0, rotary
            embeddings are applied to all dimensions.
          rotary_interleave: Interleave the head dimensions when rotary embeddings are applied.
            Otherwise the head dimensions are sliced in half.
          rotary_scaling_type: Type of RoPE scaling.
          rotary_scaling_factor: Factor used in the RoPE scaling.
          rotary_base: The base period of the rotary embeddings.
          original_max_position_embeddings: The original max position embeddings
            for Su rope embeddings
          max_position_embeddings: The max position embeddings for Su rope embeddings
          parallel_residual: Use parallel residual connections in each layer block, as used
            by the GPT-J and GPT-NeoX models.
          shared_layer_norm: When using parallel residual, share the input and post
            attention layer norms.
          pre_post_layer_norm: Add post layer norm for each pre norm layer
          multi_query_attention: Use multi-query attention (alias for num_heads_kv=1).
          num_heads_kv: Number of attention heads for the key and value.
          sliding_window: Max sequence length to retain in KV Cache.
          quant_type: quantization type used (like awq... for lower bit quantization)
          quant_group_size: group size of the lower bit quantization
          quant_bits: number of bit of the quantization (ex: 4bit)
          external_pre_post_encoder_layers: if the encoder attention pre and processing
            is done outside the attention.
        z/The GPT-J block expects a pre-norm architecturez-The GPT-J block does not have cross attentionNr   r%   r&   r'   Tr1   r0   c                     g | ]H}t          di d ddddddddd	d
dddd	dddd
dIS )rP   r   r   r   r   r   r   r   r   r    rW   rX   rY   rZ   r#   r   r   r!   r"   r^   r)   )TransformerDecoderLayerSpec)r+   r,   r^   r   r   rX   r   rW   rY   r#   r"   r   r   r   r    r   r   r   r   rZ   r!   rP   s     r-   r.   z3TransformerDecoderSpec.__init__.<locals>.<listcomp>   s6    
 
 
. - (   '='="3"3 )@(?  	
 " &: #4"3 %8$7 '<&; (K 2R1Q )@(? #4"3 #4"3 %8$7  *\!" "#$  .~%&  '( 2R1Q)
 
 
r/   Fr   quantization_typequantization_bitsquantization_group_size)"dict_configr4   r5   r6   r7   r   r   r   rR   rS   r   r*   r9   r:   r   OPTIONALscale_outputsrT   rU   rV   r!   r;   r<   r=   r>   r   
LinearSpec
projectionr8   r?   start_from_zero_embedding
project_inproject_out)%r@   r   r   r   r   r   rP   r   rQ   r   r   rR   rS   r   r   rT   rU   rV   r   r   r   r   r    rW   rX   rY   rZ   r#   r   r   r   r!   r[   r\   r]   r"   r^   s%         `  ``  ``   `````````` ```   ``r-   rA   zTransformerDecoderSpec.__init__u   s"   p vv 	R T !RSSS% R !PQQQ  	'LA,=,= K   L'**//	:: (6**//
;;!x0055oFF!x0055oFF%466 $'0
,H)&%"$(7"3"3"8"8"H"HD!	<+	< 	< "&9&;&;D# 	KM 	K)7JJJDO 	T'2'@('S'S'SD$%022
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
. :&&/
 
 

2 */&0E 1
I% 	,-  	8)466DO*577D 	G0:DL,-0:DL,-6FDL2333	G 	Gr/   c                     | j         S N)rf   r@   s    r-   configzTransformerDecoderSpec.config  s
    |r/   )rB   rC   rD   r   rE   rF   rI   rJ   r   r   rK   rL   QuantizationrA   propertyrq   r)   r/   r-   rN   rN   t   s       
 -8-C-H$)'+#$"'(-! -2!$("&JN'("01'("'"'$)&+&*"&(,9=*.$(;@KdG dGdG dG 	dG
  *dG "dG !%dG dG dG  dG "&dG dG dG dG dG  !dG" '+#dG$ %dG& SM'dG(  )dG* &n&FG+dG,  %-dG. /dG0 +.1dG2 "%3dG4  5dG6  7dG8 "9dG:  $;dG< sm=dG> 3-?dG@ !AdGB [56CdGD #3-EdGF SMGdGH IdGJ +34.KdG dG dG dGL   X  r/   rN   c                   n    e Zd Z	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddee         dedeej                 d	ed
edefdZ	dS )r3   FNTr   r   r   r   r   r   r    r#   c                    t          j        d||||||||	|
|||          | _        t          ||          | _        |rt          j        |          | _        t          j        |          | _        t          j        |          | _	        t          j        |          | _
        t          | j        d           t          | j        d           d S d S )NT)self_attentionr   r   r   r   r   r!   r   r   r   r   r    r"   glur   r0   r>   )r   MultiHeadAttentionSpecrv   FeedForwardSpecffnr   r=   input_layer_normpost_attention_layer_normpre_feedforward_layer_normpost_feedforward_layer_normdelattr)r@   r   r   r   r   r   r   r!   r   r   r   r   r    r"   r#   s                  r-   rA   z$TransformerEncoderLayerSpec.__init__!  s   " -C/$;%)!/ 3"7#
 
 
 #wBBB 	,$/$=x$P$P$PD!-8-F!. . .D* /:.G!/ / /D+ 0;/H!0 0 0D, D'666DHl+++++	, 	,r/   )FFFFNNNNTNr   r   FF)
rB   rC   rD   r   rI   rJ   r   rK   rL   rA   r)   r/   r-   r3   r3      s           %$("&JN'("$)/, /, SM/,  /, &n&FG/,  %/, /, "/, /, /, /, /, /,r/   r3   c                   >    e Zd Z	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddZdS )	ra   TFNr   r   r   c                 R   t          j        d|||||||	|
||||||          | _        |r!t          j        ||||||du           | _        t	          ||          | _        |ru|rt          j                    | _        n0t          j                    | _	        t          j                    | _
        t          | j        d           t          | j        d           |rt          j        |          | _	        t          j        |          | _
        |r6|r4t          j        |          | _        t          j        |          | _        t          j        |          | _        t          j        |          | _        t          | j        d           t          | j        d           d S d S )NT)rv   r   r   r   r   r   r   r   r    rW   rX   r   r   r!   r"   F)r   r   r   r!   r"   has_normrw   r>   r0   )r   ry   rv   	attentionrz   r{   r   r=   rZ   r|   r}   r   *external_post_encoder_attention_layer_norm)external_pre_encoder_attention_layer_normr~   r   )r@   rP   r   r   r   r   r   r   r   r   r    rW   rX   rY   rZ   r#   r   r   r!   r"   r^   s                        r-   rA   z$TransformerDecoderLayerSpec.__init__T  s   . -C/$;!/ 3"7#-M$;%)
 
 
$ " 	+B!)!-9UB  DN #wBBB 	,  M)4)B)D)D&&(3(A(C(C%1<1J1L1L.D'666DHl+++ 	,$/$=x$P$P$PD!-8-F!. . .D* & *J -x@@@ ?  -x@@@ >
 /:.G!/ / /D+ 0;/H!0 0 0D, D'666DHl+++++1	, 	,r/   )TFFFFNTNr   r   r   r   FFFNNNFFrB   rC   rD   rA   r)   r/   r-   ra   ra   S  sl          $ % )* !!).+W, W, W, W, W, W,r/   ra   c                       e Zd ZddZdS )rz   Fc                     t          j        |          | _        t          j                    | _        t          j                    | _        |rt          j                    | _        d S d S )Nr0   )r   r=   r>   ri   linear_0linear_1linear_0_noact)r@   rx   r   s      r-   rA   zFeedForwardSpec.__init__  s`    %3XFFF#.00#.00 	;"-"8":":D	; 	;r/   N)FFr   r)   r/   r-   rz   rz     s(        ; ; ; ; ; ;r/   rz   c                       e Zd Zd ZdS )r;   c                 (    t           j        | _        d S ro   )r   rg   	encodingsrp   s    r-   rA   zPositionEncoderSpec.__init__  s    #,r/   Nr   r)   r/   r-   r;   r;     s#        - - - - -r/   r;   c                   6     e Zd ZdZddee         f fdZ xZS )TransformerConfigz%Configuration for Transformer models.Nlayer_norm_epsilonc                 >     t                      j        dd|i| dS )zInitializes the configuration for Transformer models.

        Args:
          layer_norm_epsilon: The layer norm epsilon value.
          **kwargs: Additional configuration.
        r   Nr)   superrA   r@   r   kwargs	__class__s      r-   rA   zTransformerConfig.__init__  .     	II,>I&IIIIIr/   ro   rB   rC   rD   __doc__r   rL   rA   __classcell__r   s   @r-   r   r     s_        //J J8E? J J J J J J J J J Jr/   r   c                    6    e Zd ZdZdedef fdZedddej	        j
        dddej        j        dddddfd	eeeeef         f         d
ededededej	        dedededej        dededededefd            Zed             Zed             Zd Zd Zd Z xZS )TransformerSpeczDescribes a Transformer model.

    The specification is invariant to hidden dimensions but requires to
    explicitly set the number of layers and attention heads.
    encoderdecoderc                 >   t          |t                    st          d          t          |t                    st          d          t	                                                       || _        || _        | j        	                    d| j        j
                   dS )zInitializes a Transformer model specification.

        Args:
          encoder: The encoder specification.
          decoder: The decoder specification.
        1encoder argument must be a TransformerEncoderSpec1decoder argument must be a TransformerDecoderSpecr   N)
isinstancer
   	TypeErrorrN   r   rA   r   r   rf   add_attributer   )r@   r   r   r   s      r-   rA   zTransformerSpec.__init__  s     '#9:: 	QOPPP'#9:: 	QOPPP""#T\%G	
 	
 	
 	
 	
r/   FTrO   r   r   r   with_relative_positionr   r   r   rR   rS   r   r   r   r   r   r   r   c                     t          |t          t          f          r|\  }}n||}}t          ||||||	|
||||||          }t	          |||||||||||||          } | ||          S )a  Creates a Transformer model specification.

        Args:
          num_layers: Number of encoder and decoder layers, or a 2-tuple if the
            number is different.
          num_heads: Number of attention heads.
          with_relative_position: Use relative position representations in the self-attention
            layers as described in https://arxiv.org/abs/1803.02155.
          pre_norm: Enable the pre-norm Transformer architecture.
          no_final_norm: Disable the final layer norm in the pre-norm architecture.
          activation: Activation to apply in the feed-forward network.
          alignment_layer: Layer index selected for alignment.
          alignment_heads: Number of attention heads selected for alignment.
          num_source_embeddings: Number of source embeddings.
          embeddings_merge: When :obj:`num_source_embeddings` > 1, specify how the
            embeddings are merged.
          layernorm_embedding: Apply layer normalization after the embedding layer.
          relative_attention_bias: Use relative attention bias in the self-attention
            layers as described in the T5 paper https://arxiv.org/abs/1910.10683.
          ffn_glu: Use gated linear units in the FFN layer as described in
            https://arxiv.org/abs/2002.05202.
          rms_norm: Use the root mean square layer normalization.
          multi_query_attention: Use multi-query attention.
        )r   r   r   r   r   r   r   r   r   r   r   )r   r   r   r   r   r   rR   rS   r   r   r   )r   listtupler
   rN   )clsr   r   r   r   r   r   rR   rS   r   r   r   r   r   r   r   num_encoder_layersnum_decoder_layersr   r   s                       r-   from_configzTransformerSpec.from_config  s    V j4-00 	L5?2 2 25? 2('!"7- 34$;"7
 
 
  )'! 34$;++"7
 
 
  s7G$$$r/   c                     dS )Nr   r)   rp   s    r-   namezTransformerSpec.name7  s      r/   c                     dS )N   r)   rp   s    r-   revisionzTransformerSpec.revision;      qr/   c                     t                      S ro   )r   rp   s    r-   get_default_configz"TransformerSpec.get_default_config?  s     """r/   c                 .    d | j         j        D             S )Nc                 2    g | ]}|j         j        d          S )r   )weightshape)r+   specs     r-   r.   z>TransformerSpec.get_source_vocabulary_size.<locals>.<listcomp>C  s"    III!!$IIIr/   )r   r9   rp   s    r-   get_source_vocabulary_sizez*TransformerSpec.get_source_vocabulary_sizeB  s    II1HIIIIr/   c                 :    | j         j        j        j        d         S Nr   r   r9   r   r   rp   s    r-   get_target_vocabulary_sizez*TransformerSpec.get_target_vocabulary_sizeE      |&-3A66r/   )rB   rC   rD   r   r
   rN   rA   classmethodr   rE   rF   rG   rH   r   rI   r   rJ   r   rs   r   r   r   r   r   r   r   s   @r-   r   r     s        
-
8N
 
 
 
 
 
* 
 (-#-8-C-H! %&8C8S8Z$)(-&+!O% O%#uS#X./O% O% !%	O%
 O% O%  *O% O% O%  #O% &5O% "O% "&O% O% O%   $!O% O% O% [O%b ! ! X!   X# # #J J J7 7 7 7 7 7 7r/   r   c                   6     e Zd ZdZddee         f fdZ xZS )TransformerDecoderModelConfigz-Configuration for Transformer decoder models.Nr   c                 >     t                      j        dd|i| dS )zInitializes the configuration for Transformer decoder models.

        Args:
          layer_norm_epsilon: The layer norm epsilon value.
          **kwargs: Additional configuration.
        r   Nr)   r   r   s      r-   rA   z&TransformerDecoderModelConfig.__init__L  r   r/   ro   r   r   s   @r-   r   r   I  _        77J J8E? J J J J J J J J J Jr/   r   c            @           e Zd ZdZdef fdZedej        j	        ddddddddddddddd	d	dddddddddddfd
e
de
dedej        dedededededededededee
         dedeej                 dedede
de
deded ed!ed"ee
         d#ee
         d$ee
         d%eej                 d&ee
         d'ee
         d(ef>d)            Zed*             Zed+             Zd, Zd- Z xZS ).TransformerDecoderModelSpecz3Describes a Transformer decoder model (e.g. GPT-2).r   c                    t          |t                    st          d          t                                                       || _        | j        j                                        D ] \  }}| j        	                    ||           !dS )z|Initializes a Transformer decoder model specification.

        Args:
          decoder: The decoder specification.
        r   N)
r   rN   r   r   rA   r   rq   itemsrf   r   )r@   r   keyvaluer   s       r-   rA   z$TransformerDecoderModelSpec.__init__Y  s     '#9:: 	QOPPP,-3355 	3 	3JCL&&sE2222	3 	3r/   TFNr   r   r   r   r   r   r   r   r   rQ   r   r   r   rT   rU   rV   r   r   r   r   r    rW   rX   rY   rZ   r#   r   r   r   r!   r[   r\   r]   r"   c                      t          ||fi d|d|d|ddd|d|d|d	|	d
|
d|d|d|d|d|d|d|d|d|d|d|d|d|d|d|d|d|d|d|d|d|}  | |           S ) a!
  Creates a Transformer decoder model specification.

        Args:
          num_layers: Number of decoder layers.
          num_heads: Number of attention heads.
          pre_norm: Enable the pre-norm Transformer architecture.
          activation: Activation to apply in the feed-forward network.
          layernorm_embedding: Apply layer normalization after the embedding layer.
          no_final_norm: Do not apply layer normalization after the last decoder block.
          project_in_out: Add a linear layer after the embedding layer and another one
            before the final output projection.
          with_relative_position: Enable relative position representations modules.
          ffn_glu: Use gated linear units in the FFN layers as described in
            https://arxiv.org/abs/2002.05202.
          rms_norm: Use the root mean square layer normalization.
          alibi: Use attention with linear biases.
          alibi_use_positive_positions: Use positive positions in the ALiBi definition.
          scale_alibi: Apply the dot product scale factor to ALiBi.
          rotary_dim: Apply rotary embeddings to these first N dimensions. If 0, rotary
            embeddings are applied to all dimensions.
          rotary_interleave: Interleave the head dimensions when rotary embeddings are applied.
            Otherwise the head dimensions are sliced in half.
          rotary_scaling_type: Type of RoPE scaling.
          rotary_scaling_factor: Factor used in the RoPE scaling.
          rotary_base: The base period of the rotary embeddings.
          original_max_position_embeddings: The original max position embeddings
            for Su rope embeddings
          max_position_embeddings: The max position embeddings for Su rope embeddings
          parallel_residual: Use parallel residual connections in each layer block, as used
            by the GPT-J and GPT-NeoX models.
          shared_layer_norm: When using parallel residual, share the input and post
            attention layer norms.
          pre_post_layer_norm: add post layer norm for each pre norm layer
          multi_query_attention: Use multi-query attention (alias for num_heads_kv=1).
          num_heads_kv: Number of attention heads for the key and value.
          head_dim: Number of head
          sliding_window: max sequence length to retain KV cache
          quant_type: quantization type used (like awq... for lower bit quantization)
          quant_group_size: group size of the lower bit quantization
          quant_bits: number of bit of the quantization (ex: 4bit)
        r   r   r   rP   Fr   rQ   r   r   r   rT   rU   rV   r   r   r   r   r    rW   rX   rY   rZ   r#   r   r   r   r!   r[   r\   r]   r"   )rN   )!r   r   r   r   r   r   r   rQ   r   r   r   rT   rU   rV   r   r   r   r   r    rW   rX   rY   rZ   r#   r   r   r   r!   r[   r\   r]   r"   r   s!                                    r-   r   z'TransformerDecoderModelSpec.from_configg  s   X )!
 !
 !
 X!
 "z	!

 !4 3!
 $)5!
 (-!
 *>!
 54!
 G!
 X!
 %!
 *F)E!
 $!
 "z!
  0/!!
" !4 3#!
$ #8"7%!
& $'!
( .N-M)!
* %<$;+!
, 0/-!
. 0//!
0 !4 31!
2 #8"73!
4 &5!
6 X7!
8 *>9!
: "z;!
< .-=!
> "z?!
@ GA!
F s7||r/   c                     dS )NrN   r)   rp   s    r-   r   z TransformerDecoderModelSpec.name      ''r/   c                     dS )N   r)   rp   s    r-   r   z$TransformerDecoderModelSpec.revision  r   r/   c                     t                      S ro   )r   rp   s    r-   r   z.TransformerDecoderModelSpec.get_default_config      ,...r/   c                 :    | j         j        j        j        d         S r   r   rp   s    r-   get_vocabulary_sizez/TransformerDecoderModelSpec.get_vocabulary_size  r   r/   )rB   rC   rD   r   rN   rA   r   r   rE   rF   rI   rJ   r   r   rK   rL   rr   r   rs   r   r   r   r   r   r   s   @r-   r   r   V  s       ==3 6 3 3 3 3 3 3 
 -8-C-H$)#$',-2!$("&JN'("01'("'"'$)&+&*"&(,9=*.$(An nn n 	n
  *n "n n n !%n n n n '+n n SMn   !n" &n&FG#n$  %%n& 'n( +.)n* "%+n,  -n.  /n0 "1n2  $3n4 sm5n6 3-7n8 !9n: [56;n< #3-=n> SM?n@ An n n [n` ( ( X(   X/ / /7 7 7 7 7 7 7r/   r   c                   6     e Zd ZdZddee         f fdZ xZS )TransformerEncoderModelConfigz-Configuration for Transformer encoder models.Nr   c                 >     t                      j        dd|i| dS )zInitializes the configuration for Transformer encoder models.

        Args:
          layer_norm_epsilon: The layer norm epsilon value.
          **kwargs: Additional configuration.
        r   Nr)   r   r   s      r-   rA   z&TransformerEncoderModelConfig.__init__  r   r/   ro   r   r   s   @r-   r   r     r   r/   r   c                        e Zd ZdZdej        j        fdededej        f fdZ	e
d             Ze
d             Zd	 Zd
 Z xZS )TransformerEncoderModelSpecz2Describes a Transformer encoder model (e.g. BERT).Fr   pooling_layerpooling_activationc                 x   t          |t                    st          d          t                                                       || _        | j                            d| j        j                   |rFt          j
                    | _        t          j        d                              |          | _        dS dS )zInitializes a Transformer encoder model specification.

        Args:
          encoder: The encoder specification.
          pooling_layer: Add the pooling layer.
          pooling_activation: The activation to apply after the pooling layer.
        r   r   r'   N)r   r
   r   r   rA   r   rf   r   r   r   ri   pooler_denser5   r6   r7   pooler_activation)r@   r   r   r   r   s       r-   rA   z$TransformerEncoderModelSpec.__init__  s     '#9:: 	QOPPP""#T\%G	
 	
 	
  	O + 6 8 8D%'Xf%5%5%:%:;M%N%ND"""	O 	Or/   c                     dS )Nr
   r)   rp   s    r-   r   z TransformerEncoderModelSpec.name  r   r/   c                     dS )Nr   r)   rp   s    r-   r   z$TransformerEncoderModelSpec.revision  r   r/   c                     t                      S ro   )r   rp   s    r-   r   z.TransformerEncoderModelSpec.get_default_config  r   r/   c                 F    | j         j        d         j        j        d         S r   )r   r9   r   r   rp   s    r-   r   z/TransformerEncoderModelSpec.get_vocabulary_size  s    |&q)06q99r/   )rB   rC   rD   r   r   rE   Tanhr
   rJ   rA   rs   r   r   r   r   r   r   s   @r-   r   r     s        <<
 $5@5K5P	O O'O O (2	O O O O O O4 ( ( X(   X/ / /: : : : : : :r/   r   )r   typingr   r   r   numpyr5   ctranslate2.specsr   r   r   	LayerSpecr
   rN   r3   ra   rz   r;   SequenceToSequenceModelConfigr   SequenceToSequenceModelSpecr   LanguageModelConfigr   LanguageModelSpecr   r   r   r)   r/   r-   <module>r      s   6 6 ) ) ) ) ) ) ) ) ) )     E E E E E E E E E Eg
 g
 g
 g
 g
Z1 g
 g
 g
Ti i i i iZ1 i i iX0, 0, 0, 0, 0,*"6 0, 0, 0,fX, X, X, X, X,*"6 X, X, X,v; ; ; ; ;j* ; ; ;- - - - -*. - - -

J 
J 
J 
J 
J
@ 
J 
J 
J}7 }7 }7 }7 }7j< }7 }7 }7@
J 
J 
J 
J 
JJ$B 
J 
J 
JN7 N7 N7 N7 N7*"> N7 N7 N7b
J 
J 
J 
J 
JJ$B 
J 
J 
J): ): ): ): ):*"> ): ): ): ): ):r/   