
     `iB                       d Z ddlZddlZddlmZ ddlmZmZmZm	Z	 ddl
Z
ddlmc mZ ddl
mZ ddlmZ ddlmZ dd	lmZmZmZ dd
lmZmZ ddlmZmZmZmZ ddlm Z m!Z!m"Z"m#Z#m$Z$m%Z% ddl&m'Z'm(Z(m)Z)  e$j*        e+          Z,d Z-d Z.d Z/dcdZ0de
j1        de
j1        fdZ2e e!d           G d de                                   Z3e e!d           G d de                                   Z4ee! G d de                                   Z5 G d d ej6                  Z7 G d! d"ej6                  Z8 G d# d$ej6                  Z9 G d% d&ej6                  Z: G d' d(ej6                  Z; G d) d*ej6                  Z< G d+ d,ej6                  Z= G d- d.ej6                  Z> G d/ d0ej6                  Z? G d1 d2e          Z@ G d3 d4ej6                  ZA G d5 d6ej6                  ZB G d7 d8ej6                  ZC G d9 d:ej6                  ZD	 	 ddd<ej6        d=e
j1        d>e
j1        d?e
j1        d@ee
j1                 dAeEdBeEdCee
j1                 fdDZF G dE dFej6                  ZG G dG dHej6                  ZH G dI dJej6                  ZI G dK dLej6                  ZJ G dM dNej6                  ZK G dO dPe          ZL G dQ dRej6                  ZM G dS dTej6                  ZNe! G dU dVe                      ZO G dW dXeO          ZP e!dY           G dZ d[eO                      ZQe! G d\ d]eO                      ZRe! G d^ d_eO                      ZSe! G d` daeO                      ZTg dbZUdS )ezPyTorch CLAP model.    N)	dataclass)AnyCallableOptionalUnion)nn   )ACT2FN)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling,BaseModelOutputWithPoolingAndCrossAttentions)ALL_ATTENTION_FUNCTIONSPreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesmeshgridprune_linear_layer)ModelOutputauto_docstringcan_return_tuplefilter_out_non_signature_kwargslogging	torch_int   )ClapAudioConfig
ClapConfigClapTextConfigc                     | j         \  }}}| dddddddf                             dd|d          }|                    |||z  |          }|S )ae  
    Interpolate data in time domain. This is used to compensate the resolution reduction in downsampling of a CNN.

    Args:
        hidden_states (`torch.FloatTensor` of shape (batch_size, time_length, classes_num)):
            Input hidden states
        ratio (`int`):
            The ratio of the length of the output to the length of the input.
    Nr   )shaperepeatreshape)hidden_statesratio
batch_sizetime_lengthclasses_num	upsampleds         z/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/clap/modeling_clap.pyinterpolater*   +   sg     .;-@*ZkaaaD!!!m,33Aq%CCI!!*kE.A;OOI    c                     | j         \  }}}}|                     |||z  |||z  ||          } |                     dddddd                                                              d|||          }|S )aR  
    Returns the resized hidden states. The output shape should be `(batch_size * num_windows, window_size, window_size,
    num_channels)`

    Args:
        hidden_states (`torch.FloatTensor` of shape `(batch_size, height, width, num_channels)`):
            Input hidden states
        window_size (`int`):
            Window size
    r   r   r	            r    viewpermute
contiguous)r#   window_sizer%   heightwidthnum_channelswindowss          r)   window_partitionr:   <   s     /<.A+J|!&&Fk);8Lk[g M ##Aq!Q155@@BBGGKYdfrssGNr+   c                     | j         d         }|                     d||z  ||z  |||          } |                     dddddd                                                              d|||          } | S )a  
    Merges windows to produce higher resolution features.
    Args:
        windows (`torch.FloatTensor` of shape `(num_windows * batch_size, window_size, window_size, num_channels)`):
            Input windows
        window_size (`int`):
            Window size
        height (`int`):
            Height of the resized audio
        width (`int`):
            Width of the resized audio
    r0   r   r   r	   r-   r.   r/   r1   )r9   r5   r6   r7   r8   s        r)   window_reverser<   Q   sx     =$Lll2v4e{6JKYdfrssGooaAq!Q//::<<AA"feUabbGNr+   c                     |                      |                                          }t          j        |d                              |          |z   |z  }|                                |z   S )a  
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        x: torch.Tensor x:

    Returns: torch.Tensor
    r   dim)neinttorchcumsumtype_aslong)	input_idspadding_idxpast_key_values_lengthmaskincremental_indicess        r)   "create_position_ids_from_input_idsrK   e   sg     <<$$((**D <!444<<TBBE[[_cc##%%33r+   logitsreturnc                     t          j        t          |           | j                  }t          j                            | |          S )Ndevice)rB   arangelenrP   r   
functionalcross_entropy)rL   labelss     r)   contrastive_lossrV   w   s6    \#f++fm<<<F=&&vv666r+   ze
    Base class for text model's outputs that also contains a pooling of the last hidden states.
    )custom_introc                       e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
eeej        df                  ed<   dZeeej        df                  ed<   dS )ClapTextModelOutputz
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
        The text embeddings obtained by applying the projection layer to the pooler_output.
    Ntext_embedslast_hidden_state.r#   
attentions)__name__
__module____qualname____doc__rZ   r   rB   FloatTensor__annotations__r[   r#   tupler\    r+   r)   rY   rY   |   s          
 04K%+,33359x 12999=AM8E%"3S"89:AAA:>Ju0#567>>>>>r+   rY   zT
    ClapAudio model output to mimic the output of the original implementation.
    c                       e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
eeej        df                  ed<   dZeeej        df                  ed<   dS )ClapAudioModelOutputz
    audio_embeds (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
        The Audio embeddings obtained by applying the projection layer to the pooler_output.
    Naudio_embedsr[   .r#   r\   )r]   r^   r_   r`   rg   r   rB   ra   rb   r[   r#   rc   r\   rd   r+   r)   rf   rf      s          
 15L(5,-44459x 12999=AM8E%"3S"89:AAA:>Ju0#567>>>>>r+   rf   c                       e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
eej                 ed<   dZeej                 ed<   dZeej                 ed<   dZeed<   dZeed	<   d
ee         fdZdS )
ClapOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for audio-text similarity.
    logits_per_audio (`torch.FloatTensor` of shape `(audio_batch_size, text_batch_size)`):
        The scaled dot product scores between `audio_embeds` and `text_embeds`. This represents the audio-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, audio_batch_size)`):
        The scaled dot product scores between `text_embeds` and `audio_embeds`. This represents the text-audio
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`ClapTextModel`].
    audio_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The audio embeddings obtained by applying the projection layer to the pooled output of [`ClapAudioModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`ClapTextModel`].
    audio_model_output (`BaseModelOutputWithPooling`):
        The output of the [`ClapAudioModel`].
    Nlosslogits_per_audiologits_per_textrZ   rg   text_model_outputaudio_model_outputrM   c                 ^     t           fd                                 D                       S )Nc              3   t   K   | ]2}|d vr|         n!t          |                                          V  3dS ))rm   rn   N)getattrto_tuple).0kselfs     r)   	<genexpr>z&ClapOutput.to_tuple.<locals>.<genexpr>   sc       
 
  KKKDGGQXY]_`QaQaQjQjQlQl
 
 
 
 
 
r+   )rc   keysru   s   `r)   rr   zClapOutput.to_tuple   sC     
 
 
 
YY[[
 
 
 
 
 	
r+   )r]   r^   r_   r`   rj   r   rB   ra   rb   rk   rl   rZ   rg   rm   r   rn   rc   r   rr   rd   r+   r)   ri   ri      s          & )-D(5$
%,,,48hu0188837OXe/0777/3K%+,33304L(5,-444481888592999
%* 
 
 
 
 
 
r+   ri   c                   *     e Zd ZdZd fd	Zd Z xZS )ClapDropPathz
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). This is a slightly
    refactored version of the `SwinDropPath` implementation.
    Nc                 V    t                                                       || _        d S N)super__init__	drop_prob)ru   r   	__class__s     r)   r~   zClapDropPath.__init__   s$    "r+   c                    | j         dk    s| j        s|S d| j         z
  }|j        d         fd|j        dz
  z  z   }|t	          j        ||j        |j                  z   }|                                 |	                    |          |z  }|S )N        r   r   )r   dtyperP   )
r   trainingr    ndimrB   randr   rP   floor_div)ru   r#   	keep_probr    random_tensoroutputs         r)   forwardzClapDropPath.forward   s    >S     &	$Q')DM4F4J,KK!EJuM<OXeXl$m$m$mm""9--=r+   r|   )r]   r^   r_   r`   r~   r   __classcell__r   s   @r)   rz   rz      sV         
# # # # # #      r+   rz   c                   .     e Zd ZdZdef fdZd Z xZS )ClapAudioAFFBlockz
    ATTENTIONAL FEATURE FUSION Block from CLAP, since in CLAP we are always in 2D mode, it is not needed to implement
    the 1D version.
    configc                    t                                                       |j        }|j        }t	          ||z            }t          j        t          j        ||ddd          t          j        |          t          j	        d          t          j        ||ddd          t          j        |                    | _
        t          j        t          j        d          t          j        ||ddd          t          j        |          t          j	        d          t          j        ||ddd          t          j        |                    | _        t          j                    | _        d S )Nr   r   kernel_sizestridepaddingT)inplace)r}   r~   patch_embeds_hidden_sizeaff_block_rrA   r   
SequentialConv2dBatchNorm2dReLU	local_attAdaptiveAvgPool2d
global_attSigmoidsigmoid)ru   r   channelsdownsize_ratiointer_channelsr   s        r)   r~   zClapAudioAFFBlock.__init__   s<   2+X788IhAaQRSSSN>**GD!!!InhAaQRSSSN8$$
 
 - ##IhAaQRSSSN>**GD!!!InhAaQRSSSN8$$
 
 z||r+   c                     ||z   }|                      |          |                     |          z   }|                     |          }d|z  |z  d|z  d|z
  z  z   }|S )Nr-   r   )r   r   r   )ru   r#   residualattention_inputfused_layer_outputr   s         r)   r   zClapAudioAFFBlock.forward   sk    '(2!^^O<<t?_?__!\\*<==]"%77!h,!N`J`:aar+   r]   r^   r_   r`   r   r~   r   r   r   s   @r)   r   r      s]         
$ $ $ $ $ $ $0      r+   r   c                   0     e Zd ZdZdef fdZddZ xZS )ClapAudioPatchEmbedz
    This module converts the hidden states reshaped as an image to patch embeddings ready to be passed to the
    Transformer block.
    r   c                 L   t                                                       t          |j        t                    r|j        |j        fn|j        }t          |j        t                    r|j        |j        fn|j        }t          |j        t                    r|j        |j        fn|j        }|| _        || _        |d         |d         z  |d         |d         z  f| _        | j        d         | j        d         z  | _	        |j
        | _        |j        | _        |d         |d         z
  dz  |d         |d         z
  dz  f}| j        r|j        dk    rdnd}t          j        |j        |z  |j        |||          | _        |j        rt          j        |j                  nt          j                    | _        | j        r`t/          |          | _        t          j        |j        |j        |d         |d         dz  f|d         |d         dz  f|          | _        d S d S )Nr   r   r-   channel_mapr.   r   r	   )r}   r~   
isinstance	spec_sizerA   
patch_sizepatch_strideimg_size	grid_sizenum_patchesflatten_patch_embedsflattenenable_fusionfusion_typer   r   patch_embed_input_channelsr   projenable_patch_layer_norm	LayerNormIdentitynormr   fusion_model
mel_conv2d)ru   r   r   r   r   r   scale_factorr   s          r)   r~   zClapAudioPatchEmbed.__init__  s8   ;EfFVX[;\;\rF$f&677bhbr6@ARTW6X6XoV 122^d^o 	 ;EVEXZ]:^:^wV &"566djdw 	 !("1+a8(1+VW:XY>!,t~a/@@2#1qMLO39JqMLYZO<[`a;ab!/af6HM6Y6Yqq`aI-<+"
 
 
	 FLEcvBL!@AAAikitiviv	 	 1& 9 9D i1/']JqMA,=>$Qa1)<=  DOOO	 	r+   Nc                 2   | j         r|d d ddd d d d f         }|j        \  }}}}|| j        d         k    s|| j        d         k    r2t          d| d| d| j        d          d| j        d          d	          |                     |          }|                    d          }t          |          dk    r||dd d d d d f                                         }	|	j        \  }}}}|	                    ||z  d||          }	| 	                    |	          }	|	j        \  }
}}}|	                    |||||          }	|	
                    d                                                              d	          }	|	                    d          }t          j        j                            |	d||z
  fd
d          }	|                     ||         |	          ||<   |}nu|j        \  }
}
}}|| j        d         k    s|| j        d         k    r2t          d| d| d| j        d          d| j        d          d	          |                     |          }| j        r)|                    d                              dd          }|                     |          }|S )Nr   r   zInput audio size (*z) doesn't match model (z).r0   )r   r-   r	   r   r.   r	   constantr-   )r   r    r   
ValueErrorr   sizerR   r4   r2   r   r3   r   rB   r   rS   padr   	transposer   )ru   r#   is_longer_idxglobal_hidden_statesr%   r8   r6   r7   output_widthlocal_hidden_states_featureslocal_widths                r)   r   zClapAudioPatchEmbed.forward9  s
    )	5#0AaCAAA#>  7K6P3Jfeq)))UdmA6F-F-F www%wwPTP]^_P`wwcgcpqrcswww   $(99-A#B#B /44R88L=!!A%%&3M122qqq!!!4K&L&W&W&Y&Y#:M:S7
L&%&9&>&>zL?XZ[]cej&k&k#&*oo6I&J&J#-@-F*8VU&9&>&>z<Yacikp&q&q#&9&A&A/&R&R&]&]&_&_&g&ghi&j&j#166r::&+h&9&=&='!\K-G)H*VW' '# 7;6G6G(79L7 7$]3 1MM"/"5Aq&%q)))UdmA6F-F-F www%wwPTP]^_P`wwcgcpqrcswww   !IIm44M< 	E)11!44>>q!DDM		-00r+   r|   r   r   s   @r)   r   r   	  sc         
( ( ( ( ( ( (T/ / / / / / / /r+   r   c                        e Zd Z fdZ	 	 	 d
dej        deej                 deej                 dee         de	ej                 f
d	Z
 xZS )ClapAudioSelfAttentionc                    t                                                       ||z  dk    rt          d| d| d          || _        t	          ||z            | _        | j        | j        z  | _        t          |t          j	        j
                  r|n||f| _        t          j        t          j        d| j        d         z  dz
  d| j        d         z  dz
  z  |                    | _        t          j        | j        d                   }t          j        | j        d                   }t          j        t'          ||gd                    }t          j        |d          }|d d d d d f         |d d d d d f         z
  }	|	                    ddd                                          }	|	d d d d dfxx         | j        d         dz
  z  cc<   |	d d d d dfxx         | j        d         dz
  z  cc<   |	d d d d dfxx         d| j        d         z  dz
  z  cc<   |	                    d	          }
|                     d
|
           t          j        | j        | j        |j                  | _        t          j        | j        | j        |j                  | _        t          j        | j        | j        |j                  | _        t          j        |j                  | _         d S )Nr   The hidden size (6) is not a multiple of the number of attention heads ()r-   r   ij)indexingr0   relative_position_indexbias)!r}   r~   r   num_attention_headsrA   attention_head_sizeall_head_sizer   collectionsabcIterabler5   r   	ParameterrB   zerosrelative_position_bias_tablerQ   stackr   r   r3   r4   sumregister_bufferLinearqkv_biasquerykeyvalueDropoutattention_probs_dropout_probdropout)ru   r   r?   	num_headsr5   coords_hcoords_wcoordscoords_flattenrelative_coordsr   r   s              r)   r~   zClapAudioSelfAttention.__init__m  s   ?akCkk_hkkk   $- #&sY#7#7 !58PP%k;?3KLLlKKS^`kRl 	 -/LKT-a0014T=Ma=P9PST9TUW`aa-
 -
)
 < 0 344< 0 344Xx&:TJJJKKvq11(AAAt4~aaaqqqj7QQ)11!Q::EEGG111a   D$4Q$7!$;;   111a   D$4Q$7!$;;   111a   A(8(;$;a$??   "1"5"5b"9"968OPPPYt143EFO\\\
9T/1C&/ZZZYt143EFO\\\
z&"EFFr+   NFr#   attention_mask	head_maskoutput_attentionsrM   c                    |j         \  }}}||d| j        f}|                     |                              |                              dd          }	|                     |                              |                              dd          }
|                     |                              |                              dd          }t          j        |	|
                    dd                    }|t          j
        | j                  z  }| j        | j                            d                   }|                    | j        d         | j        d         z  | j        d         | j        d         z  d          }|                    ddd                                          }||                    d          z   }|v|j         d         }|                    ||z  || j        ||          }||                    d                              d          z   }|                    d| j        ||          }t$          j                            |d          }|                     |          }|||z  }t          j        ||          }|                    dddd                                          }|                                d d         | j        fz   }|                    |          }|r||fn|f}|S )Nr0   r   r-   r   r>   r	   )r    r   r   r2   r   r   r   rB   matmulmathsqrtr   r   r5   r3   r4   	unsqueezer   r   rS   softmaxr   r   r   )ru   r#   r   r   r   r%   r?   r8   hidden_shapequery_layer	key_layervalue_layerattention_scoresrelative_position_bias
mask_shapeattention_probscontext_layernew_context_layer_shapeoutputss                      r)   r   zClapAudioSelfAttention.forward  s    )6(;%
C"CT-EFjj//44\BBLLQPQRRHH]++00>>HHANN	jj//44\BBLLQPQRR !<Y5H5HR5P5PQQ+di8P.Q.QQ!%!B4C_CdCdegChCh!i!7!<!<Q$"21"55t7G7JTM]^_M`7`bd"
 "
 "8!?!?1a!H!H!S!S!U!U+.D.N.Nq.Q.QQ%'-a0J/44j(*d6NPSUX     0.2J2J12M2M2W2WXY2Z2ZZ/44R9QSVX[\\ -//0@b/II ,,77  -	9O_kBB%--aAq99DDFF"/"4"4"6"6ss";t?Q>S"S%**+BCC6G]=/22mM]r+   NNFr]   r^   r_   r~   rB   Tensorr   ra   boolrc   r   r   r   s   @r)   r   r   l  s        #G #G #G #G #GP 7;15,16 6|6 !!236 E-.	6
 $D>6 
u|	6 6 6 6 6 6 6 6r+   r   c                   P     e Zd Z fdZdej        dej        dej        fdZ xZS )ClapAudioSelfOutputc                     t                                                       t          j        ||          | _        t          j        |j                  | _        d S r|   )r}   r~   r   r   denser   r   r   ru   r   r?   r   s      r)   r~   zClapAudioSelfOutput.__init__  sD    YsC((
z&"EFFr+   r#   input_tensorrM   c                 Z    |                      |          }|                     |          }|S r|   r  r   ru   r#   r  s      r)   r   zClapAudioSelfOutput.forward  s*    

=11]33r+   r]   r^   r_   r~   rB   r  r   r   r   s   @r)   r  r    sn        G G G G G
U\  RWR^        r+   r  c                        e Zd Z fdZd Z	 	 	 ddej        deej                 deej                 dee	         d	e
ej                 f
d
Z xZS )ClapAudioAttentionc                     t                                                       t          ||||          | _        t	          ||          | _        t                      | _        d S r|   )r}   r~   r   ru   r  r   setpruned_heads)ru   r   r?   r   r5   r   s        r)   r~   zClapAudioAttention.__init__  sQ    *63	;OO	)&#66EEr+   c                    t          |          dk    rd S t          || j        j        | j        j        | j                  \  }}t          | j        j        |          | j        _        t          | j        j        |          | j        _        t          | j        j	        |          | j        _	        t          | j
        j        |d          | j
        _        | j        j        t          |          z
  | j        _        | j        j        | j        j        z  | j        _        | j                            |          | _        d S Nr   r   r>   rR   r   ru   r   r   r  r   r   r   r   r   r  r   unionru   headsindexs      r)   prune_headszClapAudioAttention.prune_heads      u::??F7490$)2OQUQb
 
u
 -TY_eDD	*49=%@@	,TY_eDD	.t{/@%QOOO )-	(EE

(R	%"&)"?$)B_"_	 -33E::r+   NFr#   r   r   r   rM   c                     |                      ||||          }|                     |d         |          }|f|dd          z   }|S )Nr   r   ru   r   )ru   r#   r   r   r   self_outputsattention_outputr  s           r)   r   zClapAudioAttention.forward  sO     yy	K\]];;|AFF#%QRR(88r+   r  r]   r^   r_   r~   r&  rB   r  r   ra   r  rc   r   r   r   s   @r)   r  r    s        " " " " "; ; ;* 7;15,1
 
|
 !!23
 E-.	

 $D>
 
u|	
 
 
 
 
 
 
 
r+   r  c                   B     e Zd Z fdZdej        dej        fdZ xZS )ClapAudioIntermediatec                 $   t                                                       t          j        |t	          |j        |z                      | _        t          |j        t                    rt          |j                 | _        d S |j        | _        d S r|   )r}   r~   r   r   rA   	mlp_ratior  r   
hidden_actstrr
   intermediate_act_fnr  s      r)   r~   zClapAudioIntermediate.__init__  sx    YsC(83(>$?$?@@
f'-- 	9'-f.?'@D$$$'-'8D$$$r+   r#   rM   c                 Z    |                      |          }|                     |          }|S r|   r  r3  ru   r#   s     r)   r   zClapAudioIntermediate.forward
  ,    

=1100??r+   r  r   s   @r)   r.  r.    ^        9 9 9 9 9U\ el        r+   r.  c                   B     e Zd Z fdZdej        dej        fdZ xZS )ClapAudioOutputc                     t                                                       t          j        t	          |j        |z            |          | _        t          j        |j                  | _	        d S r|   )
r}   r~   r   r   rA   r0  r  r   hidden_dropout_probr   r  s      r)   r~   zClapAudioOutput.__init__  sT    Ys6#3c#9::C@@
z&"<==r+   r#   rM   c                 Z    |                      |          }|                     |          }|S r|   r  r6  s     r)   r   zClapAudioOutput.forward  s*    

=11]33r+   r  r   s   @r)   r:  r:    s^        > > > > >
U\ el        r+   r:  c                        e Zd Zd fd	Zd Zd Zd Z	 	 	 dd	ej        d
e	e
e
f         deej                 dee         dee         de	ej        ej        f         fdZ xZS )ClapAudioLayerr   r   c                    t                                                       |j        | _        || _        |j        | _        || _        t          j        ||j                  | _	        t          |||| j                  | _        |dk    rt          |          nt          j                    | _        t          j        ||j                  | _        t!          ||          | _        t%          ||          | _        d S )Neps)r5   r   )r}   r~   chunk_size_feed_forward
shift_sizer5   input_resolutionr   r   layer_norm_epslayernorm_beforer  	attentionrz   r   	drop_pathlayernorm_afterr.  intermediater:  r   )ru   r   r?   rE  r   drop_path_raterD  r   s          r)   r~   zClapAudioLayer.__init__  s    '-'E$$!- 0 "Sf6K L L L+FCPTP`aaa9G#9M9Mn555SUS^S`S`!|CV5JKKK1&#>>%fc22r+   c                    t          |          | j        k    rnt          d          | _        t          j                                        r&t	          j         t	          j        |                    nt          |          | _        d S d S Nr   )minr5   r   rD  rB   jit
is_tracingtensor)ru   rE  s     r)   set_shift_and_window_sizez(ClapAudioLayer.set_shift_and_window_size,  sv      D$444'llDO=BY=Q=Q=S=Sn	%,'788999Y\]mYnYn  54r+   c           	         | j         dk    r]t          j        d||df||          }t          d| j                   t          | j         | j                    t          | j          d           f}t          d| j                   t          | j         | j                    t          | j          d           f}d}|D ]}	|D ]}
||d d |	|
d d f<   |dz  }t          || j                  }|                    d| j        | j        z            }|                    d          |                    d          z
  }|                    |dk    d                              |dk    d          }nd }|S )Nr   r   r   r0   r-   g      Yr   )	rD  rB   r   slicer5   r:   r2   r   masked_fill)ru   r6   r7   r   rP   img_maskheight_sliceswidth_slicescountheight_slicewidth_slicemask_windows	attn_masks                r)   get_attn_maskzClapAudioLayer.get_attn_mask4  s   ?Q{Avua#8fUUUHa$**++t''$/)9::t&--M a$**++t''$/)9::t&--L
 E -  #/  K@EHQQQk111<=QJEE ,Hd6FGGL',,R1ADDT1TUUL$..q11L4J4J14M4MMI!--i1nfEEQQR[_`R`beffIIIr+   c                     | j         || j         z  z
  | j         z  }| j         || j         z  z
  | j         z  }ddd|d|f}t          j                            ||          }||fS rN  )r5   r   rS   r   )ru   r#   r6   r7   	pad_right
pad_bottom
pad_valuess          r)   	maybe_padzClapAudioLayer.maybe_padP  sp    %0@(@@DDTT	&$2B)BBdFVV
Ay!Z8
))-DDj((r+   NFr#   input_dimensionsr   r   always_partitionrM   c                    |s|                      |           n	 |\  }}|                                \  }}	}
|}|                     |          }|                    ||||
          }|                     |||          \  }}|j        \  }	}}}	| j        dk    r&t          j        || j         | j         fd          }n|}t          || j
                  }|                    d| j
        | j
        z  |
          }|                     |||j        |j                  }|                     ||||          }|d         }|                    d| j
        | j
        |
          }t          || j
        ||          }| j        dk    r$t          j        || j        | j        fd          }n|}|d         dk    p|d         dk    }|r&|d d d |d |d d f                                         }|                    |||z  |
          }||                     |          z   }|                     |          }|                     |          }||                     |          z   }|r
||d	         fn|f}|S )
Nr   )r   r-   )shiftsdimsr0   r   )r   r	   r/   r   )rS  r   rG  r2   rd  r    rD  rB   rollr:   r5   r_  r   rP   rH  r<   r4   rI  rJ  rK  r   )ru   r#   re  r   r   rf  r6   r7   r%   r   r   shortcutrc  
height_pad	width_padshifted_hidden_stateshidden_states_windowsr^  attention_outputsr+  attention_windowsshifted_windows
was_paddedlayer_outputlayer_outputss                            r)   r   zClapAudioLayer.forwardW  s      	**+;<<<<("/"4"4"6"6
Ax --m<<%**:vuhOO %)NN=&%$P$P!z&3&9#:y!?Q$)J}tFVY]YhXhEipv$w$w$w!!$1! !11FHX Y Y 5 : :2t?ORVRb?bdl m m&&	)<EZEa ' 
 
	 !NN!9iK\ + 
 
 -Q/,11"d6FHXZbcc():D<LjZcdd ?Q %
?DOUYUdCelr s s s /]Q&;*Q-!*;
 	V 1!!!WfWfufaaa2G H S S U U-22:v~xXX 4>>2C#D#DD++M::((66$t{{<'@'@@@Qf'8';<<XdWfr+   )r   r   NFF)r]   r^   r_   r~   rS  r_  rd  rB   r  rc   rA   r   ra   r  r   r   r   s   @r)   r?  r?    s        3 3 3 3 3 3    8) ) ) 26,1+0A A|A  S/A E-.	A
 $D>A #4.A 
u|U\)	*A A A A A A A Ar+   r?  c                        e Zd Z fdZ	 	 	 ddej        deeef         deej	                 dee
         dee
         d	eej                 fd
Z xZS )ClapAudioStagec                 6   t                                                       | _        | _        t	          j        fdt          |          D                       | _        | |t          j                  | _	        nd | _	        d| _
        d S )Nc                 l    g | ]0}t          |         |d z  dk    rdn	j        d z            1S )r-   r   )r   r?   rE  r   rL  rD  )r?  r5   )rs   ir   r?   rI  rE  r   s     r)   
<listcomp>z+ClapAudioStage.__init__.<locals>.<listcomp>  sh     
 
 
  !%5'#,Q<%&UaZZqqf6HA6M  
 
 
r+   )r?   
norm_layerF)r}   r~   r   r?   r   
ModuleListrangeblocksr   
downsamplepointing)	ru   r   r?   rE  depthr   rI  r  r   s	    ``` `` r)   r~   zClapAudioStage.__init__  s    m
 
 
 
 
 
 
 
 u
 
 

 
 !(j)9sr|\\\DOO"DOr+   NFr#   re  r   r   rf  rM   c                 *   |\  }}t          | j                  D ](\  }}	|||         nd }
 |	|||
||          }|d         })|}| j        -|dz   dz  |dz   dz  }}||||f}|                     ||          }n||||f}|||f}|r||dd          z  }|S )Nr   r   r-   )	enumerater  r  )ru   r#   re  r   r   rf  r6   r7   r{  layer_modulelayer_head_maskru  !hidden_states_before_downsamplingheight_downsampledwidth_downsampledoutput_dimensionsstage_outputss                    r)   r   zClapAudioStage.forward  s     )(55 	- 	-OA|.7.CillO(L/BSUe M *!,MM,9)?&5;aZA4EPQ	VWGW 1!'0BDU V OO,MO_``MM!' >&(IK\] 	/]122..Mr+   rv  )r]   r^   r_   r~   rB   r  rc   rA   r   ra   r  r   r   r   s   @r)   rx  rx    s            < 26,1+0 |  S/ E-.	
 $D> #4. 
u|	       r+   rx  c            	            e Zd ZdZej        fdee         dedej        ddf fdZ	d Z
d	ej        d
eeef         dej        fdZ xZS )ClapAudioPatchMerginga'  
    Patch Merging Layer.

    Args:
        input_resolution (`tuple[int]`):
            Resolution of input feature.
        dim (`int`):
            Number of input channels.
        norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`):
            Normalization layer class.
    rE  r?   r}  rM   Nc                     t                                                       || _        || _        t	          j        d|z  d|z  d          | _         |d|z            | _        d S )Nr.   r-   Fr   )r}   r~   rE  r?   r   r   	reductionr   )ru   rE  r?   r}  r   s       r)   r~   zClapAudioPatchMerging.__init__  sa     01s7AG%@@@Jq3w''			r+   c                     |dz  dk    p|dz  dk    }|r.ddd|dz  d|dz  f}t           j                            ||          }|S )Nr-   r   r   )r   rS   r   )ru   input_featurer6   r7   
should_padrc  s         r)   rd  zClapAudioPatchMerging.maybe_pad  s\    qjAo:519>
 	IQ519a!<JM--mZHHMr+   r  re  c                    |\  }}|j         \  }}}|                    ||||          }|                     |||          }|d d dd ddd dd d f         }|d d dd ddd dd d f         }	|d d dd ddd dd d f         }
|d d dd ddd dd d f         }t          j        ||	|
|gd          }|                    |dd|z            }|                     |          }|                     |          }|S )Nr   r-   r   r0   r.   )r    r2   rd  rB   catr   r  )ru   r  re  r6   r7   r%   r?   r8   input_feature_0input_feature_1input_feature_2input_feature_3s               r)   r   zClapAudioPatchMerging.forward  sD   ((5(;%
C%**:vulSS}feDD'14a4Aqqq(89'14a4Aqqq(89'14a4Aqqq(89'14a4Aqqq(89	?O_Ve"fhjkk%**:r1|;KLL		-00}55r+   )r]   r^   r_   r`   r   r   rc   rA   Moduler~   rd  rB   r  r   r   r   s   @r)   r  r    s        
 
 XZWc ( (s (# (29 (hl ( ( ( ( ( (  U\ U3PS8_ Y^Ye        r+   r  c                        e Zd Z fdZd Z	 	 	 	 	 	 	 ddeej                 deej                 dee         d	ee         d
ee         dee         dee         de	e
ef         fdZ xZS )ClapAudioEncoderc                     t                                                       t          j                   _         _        t                     _        j         _         j        j	         _	        j
         _
        j
        j        z   _        t          j        d j        dz
  z  z             _        d t!          j        dj        t'          j                  d          D              j        j        fdt+           j                  D              _        t/          j         fdt+           j                  D                        _        d	 _        t/          j        j                   _        t/          j         j                   _        j         _        t/          j        d           _         d S )
Nr-   r   c                 6    g | ]}|                                 S rd   )item)rs   xs     r)   r|  z-ClapAudioEncoder.__init__.<locals>.<listcomp>  s     wwwq!&&((wwwr+   r   cpurO   c                 H    g | ]}d          d|z  z  d         d|z  z  fS )r   r-   r   rd   )rs   r{  r   s     r)   r|  z-ClapAudioEncoder.__init__.<locals>.<listcomp>   s9    !s!s!sWX9Q<AqD#99Q<AqD;Q"R!s!s!sr+   c                 V   g | ]}t          t          j        d |z  z            j        |         j        |         j        |         t          j        d|                   t          j        d|dz                               |j        dz
  k     rt          nd          S )r-   Nr   )r   r?   rE  r  r   rI  r  )	rx  rA   r   input_resolutionsdepthsr   r   
num_layersr  )rs   i_layerr   rL  ru   s     r)   r|  z-ClapAudioEncoder.__init__.<locals>.<listcomp>#  s         !F;ajHII%)%;G%D -0$8A,Sxx1H-I-ICPVP]^k`gjk`k^kPlLmLm-mn9@4?UVCV9V9V44]a    r+   F)!r}   r~   rR   r  r  r   r   patch_embedr   r   r   num_mel_bins
freq_ratiorA   r   num_featuresrB   linspacerL  r   r   r  r  r   r~  layersgradient_checkpointingr   
batch_normr   r   AdaptiveAvgPool1davgpool)ru   r   rL  r   r   s   ``@@r)   r~   zClapAudioEncoder.__init__  s   fm,,.v66#1 ,9) *f.AA ?!Z[H[B\ \]]wwEN1f>SUXY_YfUgUgpu,v,v,vwww$.	!s!s!s!s\abfbq\r\r!s!s!sm       %T_55  
 
 ',#.)<==L!233	m+A..r+   c                 b   |j         \  }}}}t          | j        | j        z            }| j        | j        z  }||k    s||k    rt	          d          ||k     r%t
          j                            |||fdd          }||k     r%t
          j                            |||fdd          }|j         \  }}}	}
|                    ||| j        z  |	| j        z  |
          }|	                    dddd          
                                }|                    |||
| j        z  |	| j        z            }|S )	z
        The input is 4 normalized log mel spectrograms. It is reshape to the common shape of images. Each channel
        should represent 1 of the 4 crops of the spectrogram. For more details, refer to the [`ClapFeatureExtractor`].
        z@the wav size should be less than or equal to the swin input sizebicubicT)modealign_cornersr   r   r	   r-   )r    rA   r   r  r   r   rS   r*   r"   r3   r4   )ru   normalized_input_featuresr   r&   freq_length
spec_widthspec_heightbatchr   timefreqs              r)   reshape_mel2imgz ClapAudioEncoder.reshape_mel2img8  sr   
 *C)H&1k;$/9::
n7##{['@'@_``` ##(*(A(A)J+D9dh )B ) )% $$(*(A(A)K+EIei )B ) )% '@&E#xt %>$E$E8do-tt/F%
 %
! %>$E$EaAq$Q$Q$\$\$^$^!$=$E$E8TDO3TT_5L%
 %
! )(r+   NFT	is_longerr   r   output_hidden_states(output_hidden_states_before_downsamplingrf  return_dictrM   c	                    |                     dd          }|                     |          }	|	                     dd          }	d }
| j        r8|                    |j                  }t          j        |dk              d         }
|                     |	          }|j        d         }| 	                    ||
          }|rdnd }|rdnd }|rdnd }| j
        d         }|r?|j        \  }}} |j        |g||R  }|                    dddd          }||fz  }||fz  }t          | j                  D ]\  }}|||         nd }| j
        |         } ||||||          }|d         }|d         }|d         }|d         |d         f}|rP|rN|j        \  }}} |j        |g|d         |d         f|R  }|                    dddd          }||fz  }||fz  }nC|rA|s?|j        \  }}} |j        |g||R  }|                    dddd          }||fz  }||fz  }|r||dd          z  }|                     |          }|j        \  }}}|dt!          | j                  dz
  z  z  | j        d         z  }|dt!          | j                  dz
  z  z  | j        d         z  }|                    ddd                                                              ||||          }|j        \  }}} }!| | j        z  }"|                    ||| |"z  |"|!          }|                    ddddd                                                              |||"d          }|                     t          j        |d                    }#t          j        |#d          }#|st1          d	 ||#||fD                       S t3          ||#||
          S )Nr   r	   r   r-   rd   r   r0   r.   c              3      K   | ]}||V  	d S r|   rd   )rs   vs     r)   rv   z+ClapAudioEncoder.forward.<locals>.<genexpr>  s4       	 	 =  !===	 	r+   r[   pooler_outputr#   r\   )r   r  r   torP   rB   wherer  r    r  r  r2   r3   r  r  r   rR   r  r   r4   r"   r  r  r   rc   r   )$ru   input_featuresr  r   r   r  r  rf  r  r  is_longer_list_idxis_longer_listr#   
frames_numall_hidden_statesall_reshaped_hidden_statesall_self_attentionsre  r%   r   hidden_sizereshaped_hidden_stater{  r  r  ru  r  r  r[   
n_channels
freq_shapetemporal_shapen_frequenciesn_temp
c_freq_binlatent_outputs$                                       r)   r   zClapAudioEncoder.forward\  s    (11!Q77$(OON$C$C!$=$G$G1$M$M!! 	E&\\.*?@@N!&^q-@!A!A!!D,,-FGG"(+
((8JKK"6@BBD+?%IRRT"$5?bb41!4 	C)6)<&J;$6M$6z$bDT$bVa$b$b$b!$9$A$A!Q1$M$M!-!11&+@*BB&(55 #	9 #	9OA|.7.CillO#5a8(L/BSUe M *!,M0=a0@- -a 0 1" 57H7LM# G(P G-N-T*
A{ )O(I(N)"3A"68I!8L!M)OZ) ) )% )>(E(EaAq(Q(Q%!&G%II!*/D.FF**% G.V G-:-@*
A{(:(::(fHX(fZe(f(f(f%(=(E(EaAq(Q(Q%!m%55!*/D.FF*  9#}QRR'88# IIm44$5$;!
AzA#dk*:*:Q*>$?@DDUVWDXX
#c$+.>.>.B(CDHYZ[H\\ %%aA..99;;CCJPZ\fhvww 	 9J8O5
Jv"do5
-55
MZ$?V
 
 %%aAq!44??AAII*V`blnpqq 	 U]3Da%H%HIImQ77 
	 	 	 &!.'		 	 	 	 	 	 */'4*	
 
 
 	
r+   )NNFFFFT)r]   r^   r_   r~   r  r   rB   ra   r  r   rc   rf   r   r   r   s   @r)   r  r    s	       &/ &/ &/ &/ &/P") ") ")N 2615,1/4CH+0&*u
 u
 E-.u
 E-.	u

 $D>u
 'tnu
 3;4.u
 #4.u
 d^u
 
u**	+u
 u
 u
 u
 u
 u
 u
 u
r+   r  c                   :     e Zd Zdeeef         f fdZd Z xZS )ClapProjectionLayerr   c                    t                                                       || _        |j        }|j        }t          j        ||          | _        t          |j	                 | _
        t          j        ||          | _        d S r|   )r}   r~   r   r  projection_dimr   r   linear1r
   projection_hidden_act
activationlinear2)ru   r   r  r  r   s       r)   r~   zClapProjectionLayer.__init__  si    (.yn== !=>y@@r+   c                     |                      |          }|                     |          }|                     |          }|S r|   )r  r  r  r6  s     r)   r   zClapProjectionLayer.forward  s;    ]3366]33r+   )	r]   r^   r_   r   r   r   r~   r   r   r   s   @r)   r  r    sd        Au_n%DE A A A A A A      r+   r  c                   2     e Zd ZdZ fdZ	 ddZd Z xZS )ClapTextEmbeddingszV
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    c                 l   t                                                       t          j        |j        |j        |j                  | _        t          j        |j        |j                  | _	        t          j        |j
        |j                  | _        t          j        |j        |j                  | _        t          j        |j                  | _        t#          |dd          | _        |                     dt)          j        |j                                      d          d           |                     d	t)          j        | j                                        t(          j        
          d           |j        | _        t          j        |j        |j        | j                  | _	        d S )N)rG   rA  position_embedding_typeabsoluteposition_ids)r   r0   T)
persistenttoken_type_ids)r   )r}   r~   r   	Embedding
vocab_sizer  pad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddingsr   rF  r   r<  r   rq   r  r   rB   rQ   expandr   r  r   rE   rG   ru   r   r   s     r)   r~   zClapTextEmbeddings.__init__  s}   !|F,=v?Q_e_rsss#%<0NPVPb#c#c %'\&2H&J\%]%]" f&8f>STTTz&"<=='.v7PR\']']$EL)GHHOOPWXXei 	 	
 	
 	
 	ek$*;*@*@*B*B%*UUUbf 	 	
 	
 	

 ".#%<*F,>DL\$
 $
 $
   r+   Nr   c                    |.|t          || j        |          }n|                     |          }||                                }n|                                d d         }|d         }|mt	          | d          r2| j        d d d |f         }|                    |d         |          }	|	}n+t          j        |t          j	        | j
        j                  }||                     |          }|                     |          }
||
z   }| j        dk    r|                     |          }||z  }|                     |          }|                     |          }|S )Nr0   r   r  r   r   r  )rK   rG   &create_position_ids_from_inputs_embedsr   hasattrr  r  rB   r   rE   r  rP   r  r  r  r  r   r   )ru   rF   r  r  inputs_embedsrH   input_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedr  
embeddingsr  s                r)   r   zClapTextEmbeddings.forward  s{    $A)TM]_uvv#JJ=YY #..**KK',,..ss3K ^

 !t-.. m*.*=aaa*n*M'3J3Q3QR]^_R`bl3m3m0!A!&[
SWSdSk!l!l!l  00;;M $ : :> J J"%::
':55"&":":<"H"H--J^^J//
\\*--
r+   c                    |                                 dd         }|d         }t          j        | j        dz   || j        z   dz   t          j        |j                  }|                    d                              |          S )z
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        Nr0   r   r   r   )r   rB   rQ   rG   rE   rP   r   r  )ru   r  r  sequence_lengthr  s        r)   r  z9ClapTextEmbeddings.create_position_ids_from_inputs_embeds.  s     $((**3B3/%a.|q /D4D"Dq"HPUPZcpcw
 
 
 %%a((//<<<r+   )NNNNr   )r]   r^   r_   r`   r~   r   r  r   r   s   @r)   r  r    sm         

 
 
 
 
4 rs& & & &P= = = = = = =r+   r  r   moduler   r   r   r   scalingr   r   c                 8   t          j        ||                    dd                    |z  }	|$|d d d d d d d |j        d         f         }
|	|
z   }	t          j                            |	dt           j                                      |j	                  }	t          j        
                    |	|| j                  }	||	|                    dddd          z  }	t          j        |	|          }|                    dd                                          }||	fS )Nr-   r	   r   r0   )r?   r   )pr   r   )rB   r   r   r    r   rS   r   float32r  r   r   r   r2   r4   )r  r   r   r   r   r   r   r   kwargsattn_weightscausal_maskattn_outputs               r)   eager_attention_forwardr  A  s    <s}}Q':':;;gEL!$QQQ111o	"o%=>#k1=((2U](SSVVW\WbccL=((6?([[L#innQAq&A&AA,|U33K''1--88::K$$r+   c                        e Zd Z fdZ	 	 	 d
dej        deej                 deej                 dee         de	ej                 f
d	Z
 xZS )ClapTextSelfAttentionc                    t                                                       |j        |j        z  dk    r0t	          |d          s t          d|j         d|j         d          || _        |j        | _        t          |j        |j        z            | _        | j        | j        z  | _	        t          j        |j        | j	                  | _        t          j        |j        | j	                  | _        t          j        |j        | j	                  | _        t          j        |j                  | _        |j        | _        | j        dz  | _        d S )Nr   embedding_sizer   r   r         )r}   r~   r  r   r  r   r   rA   r   r   r   r   r   r   r   r   r   r   attention_dropoutr   r  s     r)   r~   zClapTextSelfAttention.__init__^  s:    ::a??PVXhHiHi?8F$6 8 8 48 8 8  
 #)#= #&v'9F<V'V#W#W !58PPYv143EFF
9V/1CDDYv143EFF
z&"EFF!'!D/5r+   NFr#   r   r   r   rM   c                    |j         d d         }g |d| j        R }|                     |                              |                              dd          }|                     |                              |                              dd          }	|                     |                              |                              dd          }
t          }| j        j	        dk    rt          | j        j	                 } || ||	|
|f| j        sdn| j        | j        |d|\  }} |j        g |dR                                  }|r||fn|f}|S )Nr0   r   r-   eagerr   )r   r   r   )r    r   r   r2   r   r   r   r  r   _attn_implementationr   r   r  r   r"   r4   )ru   r#   r   r   r   r  r  r  query_states
key_statesvalue_statesattention_interfacer  r  r  s                  r)   r   zClapTextSelfAttention.forwards  s    $)#2#.CCbC$*BCCzz-0055lCCMMaQRSSXXm,,11,??II!QOO
zz-0055lCCMMaQRSS(?;+w66"9$+:Z"[$7$7
%
  $}HCC$2HL
%
 
%
 
%
 
%
!\ *k);;;;;;FFHH1BV;--r+   r  r  r   s   @r)   r
  r
  ]  s        6 6 6 6 60 7;15,1! !|! !!23! E-.	!
 $D>! 
u|	! ! ! ! ! ! ! !r+   r
  c                   P     e Zd Z fdZdej        dej        dej        fdZ xZS )ClapTextSelfOutputc                    t                                                       t          j        |j        |j                  | _        t          j        |j        |j                  | _        t          j        |j	                  | _
        d S NrA  )r}   r~   r   r   r  r  r   rF  r   r<  r   r  s     r)   r~   zClapTextSelfOutput.__init__  sf    Yv163EFF
f&8f>STTTz&"<==r+   r#   r  rM   c                     |                      |          }|                     |          }|                     ||z             }|S r|   r  r   r   r  s      r)   r   zClapTextSelfOutput.forward  @    

=11]33}|'CDDr+   r  r   s   @r)   r  r    i        > > > > >U\  RWR^        r+   r  c                        e Zd Z fdZd Z	 	 	 ddej        deej                 deej                 dee	         d	e
ej                 f
d
Z xZS )ClapTextAttentionc                     t                                                       t          |          | _        t	          |          | _        t                      | _        d S r|   )r}   r~   r
  ru   r  r   r  r  r  s     r)   r~   zClapTextAttention.__init__  sI    )&11	(00EEr+   c                    t          |          dk    rd S t          || j        j        | j        j        | j                  \  }}t          | j        j        |          | j        _        t          | j        j        |          | j        _        t          | j        j	        |          | j        _	        t          | j
        j        |d          | j
        _        | j        j        t          |          z
  | j        _        | j        j        | j        j        z  | j        _        | j                            |          | _        d S r   r!  r#  s      r)   r&  zClapTextAttention.prune_heads  r'  r+   NFr#   r   r   r   rM   c                 ~     | j         |f|||d|}|                     |d         |          }|f|dd          z   }|S N)r   r   r   r   r   r)  )	ru   r#   r   r   r   r  r*  r+  r  s	            r)   r   zClapTextAttention.forward  sl     !ty
)/	
 

 
 
  ;;|AFF#%QRR(88r+   r  r,  r   s   @r)   r  r    s        " " " " "; ; ;* 7;15,1 | !!23 E-.	
 $D> 
u|	       r+   r  c                   B     e Zd Z fdZdej        dej        fdZ xZS )ClapTextIntermediatec                    t                                                       t          j        |j        |j                  | _        t          |j        t                    rt          |j                 | _        d S |j        | _        d S r|   )r}   r~   r   r   r  intermediate_sizer  r   r1  r2  r
   r3  r  s     r)   r~   zClapTextIntermediate.__init__  sn    Yv163KLL
f'-- 	9'-f.?'@D$$$'-'8D$$$r+   r#   rM   c                 Z    |                      |          }|                     |          }|S r|   r5  r6  s     r)   r   zClapTextIntermediate.forward  r7  r+   r  r   s   @r)   r%  r%    r8  r+   r%  c                   P     e Zd Z fdZdej        dej        dej        fdZ xZS )ClapTextOutputc                    t                                                       t          j        |j        |j                  | _        t          j        |j        |j                  | _        t          j	        |j
                  | _        d S r  )r}   r~   r   r   r'  r  r  r   rF  r   r<  r   r  s     r)   r~   zClapTextOutput.__init__  sf    Yv79KLL
f&8f>STTTz&"<==r+   r#   r  rM   c                     |                      |          }|                     |          }|                     ||z             }|S r|   r  r  s      r)   r   zClapTextOutput.forward  r  r+   r  r   s   @r)   r*  r*    r  r+   r*  c                        e Zd Z fdZ	 	 	 ddej        deej                 deej                 dee         de	ej                 f
d	Z
d
 Z xZS )ClapTextLayerc                     t                                                       |j        | _        d| _        t	          |          | _        t          |          | _        t          |          | _	        d S )Nr   )
r}   r~   rC  seq_len_dimr  rH  r%  rK  r*  r   r  s     r)   r~   zClapTextLayer.__init__  s^    '-'E$*622088$V,,r+   NFr#   r   r   r   rM   c                      | j         |f|||d|}|d         }|dd          }t          | j        | j        | j        |          }	|	f|z   }|S r#  )rH  r   feed_forward_chunkrC  r0  )
ru   r#   r   r   r   r  self_attention_outputsr+  r  rt  s
             r)   r   zClapTextLayer.forward  s     "0"
)/	"
 "

 "
 "
 2!4(,0#T%A4CSUe
 
  /G+r+   c                 \    |                      |          }|                     ||          }|S r|   )rK  r   )ru   r+  intermediate_outputrt  s       r)   r2  z ClapTextLayer.feed_forward_chunk  s2    "//0@AA{{#68HIIr+   r  )r]   r^   r_   r~   rB   r  r   ra   r  rc   r   r2  r   r   s   @r)   r.  r.    s        - - - - - 7;15,1 | !!23 E-.	
 $D> 
u|	   2      r+   r.  c                        e Zd Z fdZe	 	 	 	 	 ddej        deej                 deej                 dee	         d	ee	         d
ee	         de
eej                 ef         fd            Z xZS )ClapTextEncoderc                     t                                                       | _        t          j        fdt          j                  D                       | _        d| _        d S )Nc                 .    g | ]}t                    S rd   )r.  )rs   r{  r   s     r)   r|  z,ClapTextEncoder.__init__.<locals>.<listcomp>!  s!    #c#c#caM&$9$9#c#c#cr+   F)	r}   r~   r   r   r~  r  num_hidden_layerslayerr  r  s    `r)   r~   zClapTextEncoder.__init__  s`    ]#c#c#c#c5IaCbCb#c#c#cdd
&+###r+   NFTr#   r   r   r   r  r  rM   c           	          |rdnd }|rdnd }	t          | j                  D ]<\  }
}|r||fz   }|||
         nd } |d||||d|}|d         }|r|	|d         fz   }	=|r||fz   }t          |||	          S )Nrd   )r#   r   r   r   r   r   )r[   r#   r\   )r  r;  r   )ru   r#   r   r   r   r  r  r  r  r  r{  r  r  ru  s                 r)   r   zClapTextEncoder.forward$  s     #7@BBD$5?bb4(44 	P 	POA|# I$58H$H!.7.CillO(L +-)"3	 
  M *!,M  P&9]1=M<O&O# 	E 1]4D D++*
 
 
 	
r+   )NNFFT)r]   r^   r_   r~   r   rB   r  r   ra   r  r   rc   r   r   r   r   s   @r)   r7  r7    s        , , , , ,  7;15,1/4&*&
 &
|&
 !!23&
 E-.	&

 $D>&
 'tn&
 d^&
 
uU\"O3	4&
 &
 &
 &
 &
 &
 &
 &
r+   r7  c                   B     e Zd Z fdZdej        dej        fdZ xZS )ClapTextPoolerc                     t                                                       t          j        |j        |j                  | _        t          j                    | _        d S r|   )r}   r~   r   r   r  r  Tanhr  r  s     r)   r~   zClapTextPooler.__init__P  sC    Yv163EFF
'))r+   r#   rM   c                 r    |d d df         }|                      |          }|                     |          }|S rN  )r  r  )ru   r#   first_token_tensorpooled_outputs       r)   r   zClapTextPooler.forwardU  s@     +111a40

#56666r+   r  r   s   @r)   r>  r>  O  s^        $ $ $ $ $
U\ el        r+   r>  c                   8    e Zd ZU eed<   dZdZdej        fdZ	dS )ClapPreTrainedModelr   clapFr  c                    | j         j        }t          |t                    rT|j        j        j                            d|dz             |j        j        j                            d|dz             dS t          |t                    rx|j
        j                            t          j        | j         j                             |j        j                            t          j        | j         j                             dS t          |t           j                  r&|j        j                            d|dz             dS t          |t           j        t           j        f          r?|j        j                                         |j        j                            d           dS t          |t           j        t           j        f          rt| j         j        dz  d| j         j        z  dz  z  |z  }t           j                            |j        |           |j         |j        j                                         dS dS t          |t6                    r |j        j                                         dS dS )	zInitialize the weightsr   g{Gz?)meanstdg      ?r  r-   )rI  N)r   initializer_factorr   r  r  weightdatanormal_r  	ClapModellogit_scale_afill_r   loglogit_scale_init_valuelogit_scale_tr   r  r   r   r   zero_r   r   r  r:  initr   r   )ru   r  factorin_proj_stds       r)   _init_weightsz!ClapPreTrainedModel._init_weightsd  s1   /f011 	=&-2::RV:WWW(/4<<#6TX=<YYYYY	** 	= %++DHT[5W,X,XYYY %++DHT[5W,X,XYYYYY-- 	=M&&CVd]&CCCCCr~ >?? 		=K""$$$M$$S)))))BI 677 	=;2D8a$+B_>_dh=hilrrKGOOFM{O;;;{& &&((((( '& 677 	=/4::<<<<<	= 	=r+   N)
r]   r^   r_   r   rb   base_model_prefixsupports_gradient_checkpointingr   r  rX  rd   r+   r)   rE  rE  ^  sK         &+#=BI = = = = = =r+   rE  c                        e Zd ZU eed<   dZdef fdZdej        fdZ	e
	 	 	 	 	 ddeej                 deej                 dee         d	ee         d
ee         deeef         fd            Z xZS )ClapAudioModelr   r  c                     t                                          |           t          |          | _        |                                  d S r|   )r}   r~   r  audio_encoder	post_initr  s     r)   r~   zClapAudioModel.__init__  sA       -f55r+   rM   c                 $    | j         j        j        S r|   )r^  r  r   rx   s    r)   get_input_embeddingsz#ClapAudioModel.get_input_embeddings  s    !-22r+   Nr  r   r  r  c                     ||n| j         j        }||n| j         j        }||n| j         j        }|                     |||||          S )ae  
        is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*):
            Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance
            the features.

        Examples:

        ```python
        >>> from datasets import load_dataset
        >>> from transformers import AutoProcessor, ClapAudioModel

        >>> dataset = load_dataset("hf-internal-testing/ashraq-esc50-1-dog-example")
        >>> audio_sample = dataset["train"]["audio"][0]["array"]

        >>> model = ClapAudioModel.from_pretrained("laion/clap-htsat-fused")
        >>> processor = AutoProcessor.from_pretrained("laion/clap-htsat-fused")

        >>> inputs = processor(audios=audio_sample, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        ```Nr  r  r   r  r  )r   use_return_dictr   r  r^  )ru   r  r  r   r  r  s         r)   r   zClapAudioModel.forward  su    > &1%<kk$+B]1B1N--TXT_Tq$8$D  $+Jj 	 !!)/!5# " 
 
 	
r+   NNNNN)r]   r^   r_   r   rb   main_input_namer~   r   r  ra  r   r   rB   ra   
BoolTensorr  r   rc   r   r   r   r   s   @r)   r\  r\  |  s        &O      3bi 3 3 3 3  7;04,0/3&**
 *
 !23*
 E,-*
 $D>	*

 'tn*
 d^*
 
u00	1*
 *
 *
 ^*
 *
 *
 *
 *
r+   r\  a0  
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in *Attention is
    all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
    Kaiser and Illia Polosukhin.

    To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
    to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.

    .. _*Attention is all you need*: https://huggingface.co/papers/1706.03762
    c                   j    e Zd ZU eed<   d fd	Zd Zd Zee		 	 	 	 	 	 	 	 	 dde
ej                 de
ej                 d	e
ej                 d
e
ej                 de
ej                 de
ej                 de
e         de
e         de
e         deeej                 ef         fd                        Z xZS )ClapTextModelr   Tc                     t                                          |           || _        t          |          | _        t          |          | _        |rt          |          nd| _        | 	                                 dS )zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        N)
r}   r~   r   r  r  r7  encoderr>  poolerr_  )ru   r   add_pooling_layerr   s      r)   r~   zClapTextModel.__init__  ss    
 	   ,V44&v..0AKnV,,,t 	r+   c                     | j         j        S r|   r  r  rx   s    r)   ra  z"ClapTextModel.get_input_embeddings  s    ..r+   c                     || j         _        d S r|   ro  ru   r   s     r)   set_input_embeddingsz"ClapTextModel.set_input_embeddings  s    */'''r+   NrF   r   r  r  r   r  r   r  r  rM   c
                    ||n| j         j        }||n| j         j        }|	|	n| j         j        }	||t	          d          |+|                     ||           |                                }
n.||                                d d         }
nt	          d          |
\  }}||j        n|j        }|t          j	        ||f|          }|gt          | j        d          r1| j        j        d d d |f         }|                    ||          }|}n!t          j        |
t          j        |          }|                     ||
          }|                     || j         j                  }|                     ||||          }|                     |||||d	          }|d
         }| j        |                     |          nd }t+          |||j        |j                  S )NzDYou cannot specify both input_ids and inputs_embeds at the same timer0   z5You have to specify either input_ids or inputs_embedsrO   r  r   )rF   r  r  r  T)r   r   r   r  r  r   r  )r   r   r  rd  r   %warn_if_padding_and_no_attention_maskr   rP   rB   onesr  r  r  r  r   rE   get_extended_attention_maskget_head_maskr:  rk  rl  r   r#   r\   )ru   rF   r   r  r  r   r  r   r  r  r  r%   r  rP   r  r  extended_attention_maskembedding_outputencoder_outputssequence_outputrC  s                        r)   r   zClapTextModel.forward  sJ    2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B] ]%>cddd"66y.QQQ#..**KK&',,..ss3KKTUUU!,
J%.%:!!@T!"Z*j)A6RRRN!t(899 [*./*HKZK*X'3J3Q3QR\^h3i3i0!A!&[
SY!Z!Z!Z 150P0PQ_al0m0m &&y$+2OPP	??%)'	 + 
 
 ,,2/!5 ' 
 
 *!,8<8OO444UY)-')7&1	
 
 
 	
r+   )T	NNNNNNNNN)r]   r^   r_   r   rb   r~   ra  rr  r   r   r   rB   r  r  r   rc   r   r   r   r   s   @r)   ri  ri    su               / / /0 0 0  -11515/3,004,0/3&*G
 G
EL)G
 !.G
 !.	G

 u|,G
 EL)G
  -G
 $D>G
 'tnG
 d^G
 
uU\"$PP	QG
 G
 G
 ^ G
 G
 G
 G
 G
r+   ri  c                   ^    e Zd ZU eed<   def fdZ e            e	 	 ddej	        de
ej	                 de
ej	                 dej        fd                        Z e            e	 	 dd	ej	        d
e
ej	                 de
ej	                 dej        fd                        Zee	 	 	 	 	 	 	 	 	 dde
ej                 d	e
ej                 d
e
ej                 de
ej	                 de
ej                 de
e         de
e         de
e         de
e         deeef         fd                        Z xZS )rN  r   c                 J   t                                          |           t          |j        t                    s%t          dt          |j                   d          t          |j        t                    s%t          dt          |j                   d          |j        }|j        }t          j
        t          j        t          j        |j                                      | _        t          j
        t          j        t          j        |j                                      | _        |j        | _        t'          |          | _        t+          |          | _        t/          |          | _        t+          |          | _        |                                  d S )NzKconfig.text_config is expected to be of type ClapTextConfig but is of type .zMconfig.audio_config is expected to be of type ClapAudioConfig but is of type )r}   r~   r   text_configr   	TypeErrortypeaudio_configr   r   r   rB   rR  r   rQ  rR  rO  rS  r  ri  
text_modelr  text_projectionr\  audio_modelaudio_projectionr_  )ru   r   r  r  r   s       r)   r~   zClapModel.__init__.  su      &,n== 	0+,,0 0 0  
 &-?? 	1,--1 1 1  
 (*\%,tx@]7^7^*_*_``\%,tx@]7^7^*_*_``$3'442;??),77 3L A A 	r+   NrF   r   r  rM   c                     |                      |||          }|                     |j                  }t          j        |d          }|S )a  
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`ClapTextModel`].

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, ClapModel

        >>> model = ClapModel.from_pretrained("laion/clap-htsat-unfused")
        >>> tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused")

        >>> inputs = tokenizer(["the sound of a cat", "the sound of a dog"], padding=True, return_tensors="pt")
        >>> with torch.inference_mode():
        ...     text_features = model.get_text_features(**inputs)
        ```)rF   r   r  r0   r>   )r  r  r  F	normalize)ru   rF   r   r  text_outputstext_featuress         r)   get_text_featureszClapModel.get_text_featuresN  sV    4 48??\ 4C 4
 4
 ,,\-GHHMr:::r+   r  r  c                     |                      ||          }|                     |j                  }t          j        |d          }|S )a  
        is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*):
            Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance
            the features.

        Returns:
            audio_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The audio embeddings obtained by
            applying the projection layer to the pooled output of [`ClapAudioModel`].

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoFeatureExtractor, ClapModel

        >>> model = ClapModel.from_pretrained("laion/clap-htsat-unfused")
        >>> feature_extractor = AutoFeatureExtractor.from_pretrained("laion/clap-htsat-unfused")
        >>> random_audio = torch.rand((16_000))

        >>> inputs = feature_extractor(random_audio, return_tensors="pt")
        >>> with torch.inference_mode():
        ...     audio_features = model.get_audio_features(**inputs)
        ```)r  r  r0   r>   )r  r  r  r  r  )ru   r  r  r   audio_outputsaudio_featuress         r)   get_audio_featureszClapModel.get_audio_featuresp  sV    > 594D4D)Y 5E 5
 5
 ..}/JKK^<<<r+   return_lossr   r  r  c
           	         ||n| j         j        }||n| j         j        }|	|	n| j         j        }	|                     ||||d          }
|                     |||||d          }|	s|
d         n|
j        }|                     |          }|	s|d         n|j        }|                     |          }||	                    ddd          z  }||	                    ddd          z  }| j
                                        }| j                                        }t          j        ||                                          |z  }t          j        ||                                          |z  }d}|r8t!          |          }t!          |                                          }||z   d	z  }t#          |||||||

          S )a  
        is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*):
            Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance
            the features.
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:

        ```python
        >>> from datasets import load_dataset
        >>> from transformers import AutoProcessor, ClapModel

        >>> dataset = load_dataset("hf-internal-testing/ashraq-esc50-1-dog-example")
        >>> audio_sample = dataset["train"]["audio"][0]["array"]

        >>> model = ClapModel.from_pretrained("laion/clap-htsat-unfused")
        >>> processor = AutoProcessor.from_pretrained("laion/clap-htsat-unfused")

        >>> input_text = ["Sound of a dog", "Sound of vacuum cleaner"]

        >>> inputs = processor(text=input_text, audios=audio_sample, return_tensors="pt", padding=True)

        >>> outputs = model(**inputs)
        >>> logits_per_audio = outputs.logits_per_audio  # this is the audio-text similarity score
        >>> probs = logits_per_audio.softmax(dim=-1)  # we can take the softmax to get the label probabilities
        ```NTrc  rF   r   r  r   r  r  r   r-   r0   )r  r?   keepdimg       @)rj   rk   rl   rZ   rg   rm   rn   )r   r   r  rd  r  r  r  r  r  r   rS  exprO  rB   r   trV   ri   )ru   rF   r  r  r   r  r  r   r  r  r  r  rg   rZ   logit_scale_textlogit_scale_audiorl   rk   rj   caption_loss
audio_losss                        r)   r   zClapModel.forward  s   T 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B](()/!5 ) 
 
 )%/!5 ' 
 
 0;[}Q''@[,,\::-8Xl1ool>X**;77 $l&7&7!T&7&R&RR!K$4$4qb$$4$O$OO  -1133 .2244,{LNN4D4DEEHXX <kmmooFFIZZ 	5+O<<L)*:*<*<*>*>??J :-4D-+#%*,
 
 
 	
r+   )NNr|  )r]   r^   r_   r   rb   r~   r   r   rB   r  r   ra   r  r  r   
LongTensorrg  r  r   rc   ri   r   r   r   s   @r)   rN  rN  *  sL        z      @ %$&& 26/3	 < !. u|,	
 
	   ^ '&@ %$&& -115	# ## EL)# !.	#
 
	# # # ^ '&#J  156:041537&*,0/3&*]
 ]
E,-]
 !!23]
 E,-	]

 !.]
 u/0]
 d^]
 $D>]
 'tn]
 d^]
 
uj 	!]
 ]
 ]
 ^ ]
 ]
 ]
 ]
 ]
r+   rN  c                       e Zd ZU eed<   def fdZdej        fdZd Z	e
e	 	 	 	 	 	 ddeej                 deej                 d	eej                 d
ee         dee         dee         deeef         fd                        Z xZS )ClapTextModelWithProjectionr   c                     t                                          |           t          |          | _        t	          |          | _        |                                  d S r|   )r}   r~   ri  r  r  r  r_  r  s     r)   r~   z$ClapTextModelWithProjection.__init__  sP       '//26::r+   rM   c                 $    | j         j        j        S r|   r  r  r  rx   s    r)   ra  z0ClapTextModelWithProjection.get_input_embeddings  s    )99r+   c                 (    || j         j        _        d S r|   r  rq  s     r)   rr  z0ClapTextModelWithProjection.set_input_embeddings  s    5:"222r+   NrF   r   r  r   r  r  c                     ||n| j         j        }|                     |||||d          }|s|d         n|j        }|                     |          }	t          |	|j        |j        |j                  S )a  
        Examples:

        ```python
        >>> from transformers import AutoTokenizer, ClapTextModelWithProjection

        >>> model = ClapTextModelWithProjection.from_pretrained("laion/clap-htsat-unfused")
        >>> tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused")

        >>> inputs = tokenizer(["a sound of a cat", "a sound of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> text_embeds = outputs.text_embeds
        ```NTr  r   )rZ   r[   r#   r\   )	r   rd  r  r  r  rY   r[   r#   r\   )
ru   rF   r   r  r   r  r  r  rC  rZ   s
             r)   r   z#ClapTextModelWithProjection.forward
  s    2 &1%<kk$+B])%/!5 ' 
 
 0;ZQ@Z**=99"#*<&4#.	
 
 
 	
r+   )NNNNNN)r]   r^   r_   r   rb   r~   r   r  ra  rr  r   r   r   rB   r  r  r   rc   rY   r   r   r   s   @r)   r  r    s2        ~      :bi : : : :; ; ;  -115/3,0/3&*+
 +
EL)+
 !.+
 u|,	+

 $D>+
 'tn+
 d^+
 
u))	*+
 +
 +
 ^ +
 +
 +
 +
 +
r+   r  c                        e Zd ZU eed<   dZdef fdZdej        fdZ	e
e	 	 	 	 	 ddeej                 deej                 dee         d	ee         d
ee         deeef         fd                        Z xZS )ClapAudioModelWithProjectionr   r  c                     t                                          |           t          |          | _        t	          |          | _        |                                  d S r|   )r}   r~   r\  r  r  r  r_  r  s     r)   r~   z%ClapAudioModelWithProjection.__init__?  sQ       )&11 3F ; ;r+   rM   c                 .    | j         j        j        j        S r|   )r  r^  r  r   rx   s    r)   ra  z1ClapAudioModelWithProjection.get_input_embeddingsF  s    -9>>r+   Nr  r   r  r  c                 &   ||n| j         j        }||n| j         j        }||n| j         j        }|                     ||||d          }|s|d         n|j        }|                     |          }t          ||j        |j	        |j
                  S )av  
        is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*):
            Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance
            the features.

        Examples:

        ```python
        >>> from datasets import load_dataset
        >>> from transformers import ClapAudioModelWithProjection, ClapProcessor

        >>> model = ClapAudioModelWithProjection.from_pretrained("laion/clap-htsat-fused")
        >>> processor = ClapProcessor.from_pretrained("laion/clap-htsat-fused")

        >>> dataset = load_dataset("hf-internal-testing/ashraq-esc50-1-dog-example")
        >>> audio_sample = dataset["train"]["audio"][0]["array"]

        >>> inputs = processor(audios=audio_sample, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> audio_embeds = outputs.audio_embeds
        ```NTrc  r   )rg   r[   r\   r#   )r   rd  r   r  r  r  r  rf   r[   r\   r#   )	ru   r  r  r   r  r  r  rC  rg   s	            r)   r   z$ClapAudioModelWithProjection.forwardI  s    > &1%<kk$+B]1B1N--TXT_Tq$8$D  $+Jj 	 (()/!5 ) 
 
 1<\a((A\,,];;#%+=$/'5	
 
 
 	
r+   re  )r]   r^   r_   r   rb   rf  r~   r   r  ra  r   r   r   rB   ra   rg  r  r   rc   rf   r   r   r   s   @r)   r  r  :  s        &O      ?bi ? ? ? ?  7;04,0/3&*4
 4
 !234
 E,-4
 $D>	4

 'tn4
 d^4
 
u**	+4
 4
 4
 ^ 4
 4
 4
 4
 4
r+   r  )rN  rE  ri  r  r\  r  )r   )r   N)Vr`   r   r   dataclassesr   typingr   r   r   r   rB   torch.nn.functionalr   rS   r  activationsr
   modeling_layersr   modeling_outputsr   r   r   modeling_utilsr   r   pytorch_utilsr   r   r   r   utilsr   r   r   r   r   r   configuration_clapr   r   r   
get_loggerr]   loggerr*   r:   r<   rK   r  rV   rY   rf   ri   r  rz   r   r   r   r  r  r.  r:  r?  rx  r  r  r  r  floatr  r
  r  r  r%  r*  r.  r7  r>  rE  r\  ri  rN  r  r  __all__rd   r+   r)   <module>r     s          ! ! ! ! ! ! 1 1 1 1 1 1 1 1 1 1 1 1                 ! ! ! ! ! ! 9 9 9 9 9 9         
 G F F F F F F F v v v v v v v v v v v v w w w w w w w w w w w w w w w w K K K K K K K K K K 
	H	%	%  "  *  (4 4 4 4$7U\ 7el 7 7 7 7
   	? 	? 	? 	? 	?+ 	? 	?  	?   
	? 	? 	? 	? 	?; 	? 	?  	?  
  
  
  
  
  
  
   
H    29   2% % % % %	 % % %P_ _ _ _ _") _ _ _F\ \ \ \ \RY \ \ \@
 
 
 
 
") 
 
 
# # # # # # # #N    BI    	 	 	 	 	bi 	 	 	z z z z zRY z z z|9 9 9 9 9/ 9 9 9z3 3 3 3 3BI 3 3 3lB
 B
 B
 B
 B
ry B
 B
 B
J    ")   &V= V= V= V= V= V= V= V=B (,% %I%<% 
% <	%
 U\*% % % %% % % %87 7 7 7 7BI 7 7 7v       * * * * *	 * * *\    29        RY   % % % % %. % % %R.
 .
 .
 .
 .
bi .
 .
 .
d    RY    = = = = =/ = = =:8
 8
 8
 8
 8
( 8
 8
 8
v   b
 b
 b
 b
 b
' b
 b
 b
J K
 K
 K
 K
 K
# K
 K
 K
\ =
 =
 =
 =
 =
"5 =
 =
 =
@ D
 D
 D
 D
 D
#6 D
 D
 D
N  r+   