
     `i                        d Z ddlZddlZddlmZ ddlmZmZmZm	Z	 ddl
Z
ddl
mZ ddlmZ ddlmZmZ dd	lmZ dd
lmZmZ ddlmZmZ ddlmZmZmZmZmZmZ ddl m!Z!m"Z"m#Z#  ej$        e%          Z&de
j'        de
j'        fdZ(de
j'        de
j'        fdZ)ee G d de                                  Z*ee G d de                                  Z+ee G d de                                  Z, G d dej-                  Z. G d dej-                  Z/	 dDdej-        d e
j'        d!e
j'        d"e
j'        d#ee
j'                 d$e0d%e0fd&Z1 G d' d(ej-                  Z2 G d) d*ej-                  Z3 G d+ d,e          Z4e G d- d.e                      Z5 G d/ d0ej-                  Z6 G d1 d2ej-                  Z7 G d3 d4e5          Z8 G d5 d6ej-                  Z9 G d7 d8e5          Z:e G d9 d:e5                      Z; G d; d<ej-                  Z< G d= d>e5          Z= ed?@           G dA dBe5                      Z>g dCZ?dS )EzPyTorch CLIPSeg model.    N)	dataclass)AnyCallableOptionalUnion)nn   )ACT2FN) _create_4d_causal_attention_mask_prepare_4d_attention_mask)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling)ALL_ATTENTION_FUNCTIONSPreTrainedModel)ModelOutputauto_docstringcan_return_tuplefilter_out_non_signature_kwargslogging	torch_int   )CLIPSegConfigCLIPSegTextConfigCLIPSegVisionConfiglogitsreturnc                     t           j                            | t          j        t          |           | j                            S )Ndevice)r   
functionalcross_entropytorcharangelenr    )r   s    /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/clipseg/modeling_clipseg.pycontrastive_lossr'   '   s3    =&&vu|CKKPVP]/^/^/^___    
similarityc                 r    t          |           }t          |                                           }||z   dz  S )Ng       @)r'   t)r)   caption_loss
image_losss      r&   clipseg_lossr.   ,   s4    #J//L!*,,..11J:%,,r(   c                       e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
eej                 ed<   dZeej                 ed<   dZeej                 ed<   dZeed<   dZeed	<   d
ee         fdZdS )CLIPSegOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`CLIPSegTextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The image embeddings obtained by applying the projection layer to the pooled output of [`CLIPSegVisionModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`CLIPSegTextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`CLIPSegVisionModel`].
    Nlosslogits_per_imagelogits_per_texttext_embedsimage_embedstext_model_outputvision_model_outputr   c                 ^     t           fd                                 D                       S )Nc              3   t   K   | ]2}|d vr|         n!t          |                                          V  3dS ))r6   r7   Ngetattrto_tuple.0kselfs     r&   	<genexpr>z)CLIPSegOutput.to_tuple.<locals>.<genexpr>R   sc       
 
  LLLDGGRYZ^`aRbRbRkRkRmRm
 
 
 
 
 
r(   tuplekeysr@   s   `r&   r<   zCLIPSegOutput.to_tupleQ   C     
 
 
 
YY[[
 
 
 
 
 	
r(   )__name__
__module____qualname____doc__r1   r   r#   FloatTensor__annotations__r2   r3   r4   r5   r6   r   r7   rC   r   r<    r(   r&   r0   r0   2   s          & )-D(5$
%,,,48hu0188837OXe/0777/3K%+,33304L(5,-4444818886:3:::
%* 
 
 
 
 
 
r(   r0   c                       e Zd ZU dZdZeej                 ed<   dZ	ee
ej                          ed<   dZee
ej                          ed<   dS )CLIPSegDecoderOutputz|
    logits (`torch.FloatTensor` of shape `(batch_size, height, width)`):
        Classification scores for each pixel.
    Nr   hidden_states
attentions)rG   rH   rI   rJ   r   r   r#   rK   rL   rP   rC   rQ   rM   r(   r&   rO   rO   X   sp          
 +/FHU&'...8<M8E%"345<<<59Ju01299999r(   rO   c                       e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
eej                 ed<   dZeej                 ed<   dZeed<   dZeed<   d	ee         fd
ZdS )CLIPSegImageSegmentationOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Binary cross entropy loss for segmentation.
    logits (`torch.FloatTensor` of shape `(batch_size, height, width)`):
        Classification scores for each pixel.
    conditional_embeddings (`torch.FloatTensor` of shape `(batch_size, projection_dim)`):
        Conditional embeddings used for segmentation.
    pooled_output (`torch.FloatTensor` of shape `(batch_size, embed_dim)`):
        Pooled output of the [`CLIPSegVisionModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`CLIPSegVisionModel`].
    decoder_output (`CLIPSegDecoderOutput`):
        The output of the [`CLIPSegDecoder`].
    Nr1   r   conditional_embeddingspooled_outputr7   decoder_outputr   c                 ^     t           fd                                 D                       S )Nc              3   t   K   | ]2}|d vr|         n!t          |                                          V  3dS ))r7   rV   Nr:   r=   s     r&   rA   z:CLIPSegImageSegmentationOutput.to_tuple.<locals>.<genexpr>   sb       
 
  IIIDGGwW[]^O_O_OhOhOjOj
 
 
 
 
 
r(   rB   rE   s   `r&   r<   z'CLIPSegImageSegmentationOutput.to_tuple~   rF   r(   )rG   rH   rI   rJ   r1   r   r#   rK   rL   r   rT   rU   r7   r   rV   rO   rC   r   r<   rM   r(   r&   rS   rS   e   s           )-D(5$
%,,,*.FHU&'...:>HU%67>>>15M8E-.5556:3:::+/N(///
%* 
 
 
 
 
 
r(   rS   c                   v     e Zd Zdef fdZdej        dededej        fdZdd	ej	        dej        fd
Z
 xZS )CLIPSegVisionEmbeddingsconfigc                 z   t                                                       || _        |j        | _        |j        | _        |j        | _        t          j        t          j
        | j                            | _        t          j        |j        | j        | j        | j        d          | _        | j        | j        z  dz  | _        | j        dz   | _        t          j        | j        | j                  | _        |                     dt          j        | j                                      d          d           d S )NF)in_channelsout_channelskernel_sizestridebias   r   position_idsr   
persistent)super__init__r[   hidden_size	embed_dim
image_size
patch_sizer   	Parameterr#   randnclass_embeddingConv2dnum_channelspatch_embeddingnum_patchesnum_positions	Embeddingposition_embeddingregister_bufferr$   expandr@   r[   	__class__s     r&   ri   z CLIPSegVisionEmbeddings.__init__   s   + + +!|EK,G,GHH!y+? 
  
  
 !Ot>1D!-1"$,t/A4>"R"R^U\$:L-M-M-T-TU\-]-]jopppppr(   
embeddingsheightwidthr   c                    |j         d         dz
  }| j        j                            d          }|j         d         dz
  }t          j                                        s&||k    r ||k    r|                     | j                  S |ddddf         }|ddddf         }|j         d         }	|| j        z  }
|| j        z  }t          |dz            }|
                    d|||	          }|                    dddd          }t          j                            ||
|fdd	
          }|                    dddd                              dd|	          }t	          j        ||fd          S )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   r   Nre   g      ?r	   rb   bicubicF)sizemodealign_cornersdim)shaperw   weight	unsqueezer#   jit
is_tracingrc   rm   r   reshapepermuter   r!   interpolateviewcat)r@   r|   r}   r~   rt   rw   ru   class_pos_embedpatch_pos_embedr   
new_height	new_widthsqrt_num_positionss                r&   interpolate_pos_encodingz0CLIPSegVisionEmbeddings.interpolate_pos_encoding   s    !&q)A-!4;EEaHH*03a7 y##%% 	>+*F*F6UZ??**4+<===,QQQU3,QQQU3r"t.
T_,	&}c'9::)11!5GI[]`aa)11!Q1==-33i(	 4 
 
 *11!Q1==BB1b#NNy/?;CCCCr(   Tpixel_valuesc                    |j         \  }}}}|s<|| j        k    s|| j        k    r&t          d| d| d| j         d| j         d	          |                     |          }|                    d                              dd          }| j                            |dd          }t          j	        ||gd          }	|r|	| 
                    |	||          z   }	n|	|                     | j                  z   }	|	S )	NzInput image size (*z) doesn't match model ().rb   r   re   r   )r   rl   
ValueErrorrs   flatten	transposerp   ry   r#   r   r   rw   rc   )
r@   r   r   
batch_size_r}   r~   patch_embedsclass_embedsr|   s
             r&   forwardzCLIPSegVisionEmbeddings.forward   s(   '3'9$
Avu' 	Vt-F-F%SWSbJbJbqVqqeqqDOqq^b^mqqq   ++L99#++A..88A>>+22:q"EEYl;CCC
# 	Q#d&C&CJPVX]&^&^^JJ#d&=&=d>O&P&PPJr(   T)rG   rH   rI   r   ri   r#   Tensorintr   rK   r   __classcell__r{   s   @r&   rZ   rZ      s        q2 q q q q q q,'D5< 'D 'DUX 'D]b]i 'D 'D 'D 'DR E$5 Y^Ye        r(   rZ   c            	            e Zd Zdef fdZ	 	 	 d	deej                 deej                 deej                 dej	        fdZ
 xZS )
CLIPSegTextEmbeddingsr[   c                 V   t                                                       |j        }t          j        |j        |          | _        t          j        |j        |          | _        | 	                    dt          j        |j                                      d          d           d S )Nrc   rd   Frf   )rh   ri   rj   r   rv   
vocab_sizetoken_embeddingmax_position_embeddingsrw   rx   r#   r$   ry   r@   r[   rk   r{   s      r&   ri   zCLIPSegTextEmbeddings.__init__   s    &	!|F,=yII"$,v/My"Y"Y 	EL)GHHOOPWXXej 	 	
 	
 	
 	
 	
r(   N	input_idsrc   inputs_embedsr   c                 .   ||j         d         n|j         d         }| j        j        j         d         }||k    rt          d| d|           || j        d d d |f         }||                     |          }|                     |          }||z   }|S )Nre   r   zRSequence length must be less than max_position_embeddings (got `sequence length`: z and max_position_embeddings: )r   rw   r   r   rc   r   )r@   r   rc   r   
seq_lengthmax_position_embeddingposition_embeddingsr|   s           r&   r   zCLIPSegTextEmbeddings.forward   s     -6,AY_R((}GZ[]G^
!%!8!?!Ea!H...VV V=SV V  
 ,QQQ^<L  00;;M"55lCC"%88
r(   )NNN)rG   rH   rI   r   ri   r   r#   
LongTensorrK   r   r   r   r   s   @r&   r   r      s        

0 

 

 

 

 

 

 153759	 E,- u/0   12	
 
       r(   r           modulequerykeyvalueattention_maskscalingdropoutc                    t          j        ||                    dd                    |z  }|||z   }t          j                            |dt           j                                      |j                  }t          j        	                    ||| j
                  }t          j        ||          }	|	                    dd                                          }	|	|fS )Nre   r   )r   dtype)ptrainingr   rb   )r#   matmulr   r   r!   softmaxfloat32tor   r   r   
contiguous)
r   r   r   r   r   r   r   kwargsattn_weightsattn_outputs
             r&   eager_attention_forwardr     s     <s}}R'<'<==GL!#n4=((2U](SSVVW\WbccL=((6?([[L,|U33K''1--88::K$$r(   c                        e Zd ZdZdeeef         f fdZ	 	 	 ddej	        de
ej	                 de
ej	                 d	e
e         d
eej	        e
ej	                 f         f
dZ xZS )CLIPSegAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr[   c                    t                                                       || _        |j        | _        |j        | _        | j        | j        z  | _        | j        | j        z  | j        k    r t          d| j         d| j         d          | j        dz  | _	        |j
        | _        d| _        t          j        | j        | j                  | _        t          j        | j        | j                  | _        t          j        | j        | j                  | _        t          j        | j        | j                  | _        d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: r         F)rh   ri   r[   rj   rk   num_attention_heads	num_headshead_dimr   scaleattention_dropoutr   	is_causalr   Lineark_projv_projq_projout_projrz   s     r&   ri   zCLIPSegAttention.__init__  s   +3$.8=4>)T^;;'dn ' 'N' ' '   ]D(
/i??i??i??	$.$.AAr(   NFrP   r   causal_attention_maskoutput_attentionsr   c           
      n   |j         \  }}}|                     |          }|                     |          }	|                     |          }
|                    ||| j        | j                                      dd          }|	                    ||| j        | j                                      dd          }	|
                    ||| j        | j                                      dd          }
| j        j	        dk    r||||z   }n||}n	|du| _
        t          }| j        j	        dk    rt          | j        j	                 } || ||	|
|| j
        | j        | j        sdn| j                  \  }}|                    |||                                          }|                     |          }|sd}||fS )z#Input shape: Batch x Time x Channelr   rb   flash_attention_2Neagerr   )r   r   r   )r   r   r   r   r   r   r   r   r[   _attn_implementationr   r   r   r   r   r   r   r   r   )r@   rP   r   r   r   r   r   rk   queriesrD   valuesattention_interfacer   r   s                 r&   r   zCLIPSegAttention.forward0  s    -:,?)
J	++m,,{{=))]++,,z:t~t}UU__`acdeeyyZOOYYZ[]^__ZT^T]SS]]^_abcc ;+/BBB).C.O!/2G!G&2!62$>DN(?;+w66"9$+:Z"[$7$7nJ#}>CC$,	%
 	%
 	%
!\ "))*j)LLWWYYmmK00  	 LL((r(   )NNF)rG   rH   rI   rJ   r   r   r   ri   r#   r   r   boolrC   r   r   r   s   @r&   r   r     s        GGBu%8:K%KL B B B B B B. 268<,10) 0)|0) !.0)  (5	0)
 $D>0) 
u|Xel33	40) 0) 0) 0) 0) 0) 0) 0)r(   r   c                   B     e Zd Z fdZdej        dej        fdZ xZS )
CLIPSegMLPc                    t                                                       || _        t          |j                 | _        t          j        |j        |j	                  | _
        t          j        |j	        |j                  | _        d S N)rh   ri   r[   r
   
hidden_actactivation_fnr   r   rj   intermediate_sizefc1fc2rz   s     r&   ri   zCLIPSegMLP.__init__e  sf    #F$569V/1IJJ9V5v7IJJr(   rP   r   c                     |                      |          }|                     |          }|                     |          }|S r   )r   r   r   )r@   rP   s     r&   r   zCLIPSegMLP.forwardl  s=    //**=99//r(   )rG   rH   rI   ri   r#   r   r   r   r   s   @r&   r   r   d  sc        K K K K KU\ el        r(   r   c                        e Zd Zdef fdZ	 d
dej        dej        dej        dee         de	ej
                 f
d	Z xZS )CLIPSegEncoderLayerr[   c                 D   t                                                       |j        | _        t	          |          | _        t          j        | j        |j                  | _	        t          |          | _        t          j        | j        |j                  | _        d S N)epsrh   ri   rj   rk   r   	self_attnr   	LayerNormlayer_norm_epslayer_norm1r   mlplayer_norm2rz   s     r&   ri   zCLIPSegEncoderLayer.__init__u      +)&11<F<QRRRf%%<F<QRRRr(   FrP   r   r   r   r   c                     |}|                      |          }|                     ||||          \  }}||z   }|}|                     |          }|                     |          }||z   }|f}|r||fz  }|S aI  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )rP   r   r   r   )r   r   r   r   r@   rP   r   r   r   residualr   outputss           r&   r   zCLIPSegEncoderLayer.forward}  s    " !((77&*nn')"7/	 '5 '
 '
#| !=0 ((77// =0 " 	'&Gr(   F)rG   rH   rI   r   ri   r#   r   r   r   rC   rK   r   r   r   s   @r&   r   r   t  s        S} S S S S S S -2& &|& &  %|	&
 $D>& 
u 	!& & & & & & & &r(   r   c                   (    e Zd ZU eed<   dZdZd ZdS )CLIPSegPreTrainedModelr[   clipTc                 4   | j         j        }t          |t                    rT|j        j        j                            d|dz             |j        j        j                            d|dz             nt          |t                    r| j         j        }t          j                            |j        d|j        dz  |z             t          j                            |j        j        |j         j        |z             t          j                            |j        j        |j         j        |z             nCt          |t                     r| j         j        }|j        dz  d|j         j        z  dz  z  |z  }|j        dz  |z  }t          j                            |j        j        |           t          j                            |j        j        |           t          j                            |j        j        |           t          j                            |j        j        |           nGt          |t,                    r| j         j        }|j         j        dz  d|j         j        z  dz  z  |z  }d|j         j        z  dz  |z  }t          j                            |j        j        |           t          j                            |j        j        |           nt          |t4                    rt          j                            |j        j        |j        dz  | j         j        z             t          j                            |j        j        |j        dz  | j         j        z             t          |t          j                  r=|j         j        !                                 |j        j        "                    d           t          |t          j#                  r'|j         "|j         j        !                                 dS dS dS )	zInitialize the weightsr   g{Gz?)meanstdr   )r  rb   g      ?N)$r[   initializer_factor
isinstancer   r   r   datanormal_rw   rZ   r   initrp   rk   rs   initializer_ranger   num_hidden_layersr   r   r   r   r   rj   r   r   CLIPSegModeltext_projectiontext_embed_dimvisual_projectionvision_embed_dimr   ra   zero_fill_r   )r@   r   factorin_proj_stdout_proj_stdfc_stds         r&   _init_weightsz$CLIPSegPreTrainedModel._init_weights  s   /f344 	").66CVd]6SSS%,199sQU9VVVV 788 	[3FGOOF2&BRTXBX[aBaObbbGOOF29v}?^ag?gOhhhGOOF5<&-BadjBjOkkkk 011 	[3F!+T1q6=;Z7Z_c6cdgmmK",d2f<LGOOFM0kOBBBGOOFM0kOBBBGOOFM0kOBBBGOOFO2OEEEE
++ 	[3F!=4d:FMDc@chl?lmpvvK&-33<vEFGOOFJ-6O:::GOOFJ-;O????-- 	GOO&-)4/$+2PP     GOO(/+T1DK4RR    
 fbl++ 	*K""$$$M$$S)))fbi(( 	%V[-DK""$$$$$	% 	%-D-Dr(   N)rG   rH   rI   r   rL   base_model_prefixsupports_gradient_checkpointingr  rM   r(   r&   r  r    s=         &*#'% '% '% '% '%r(   r  c                        e Zd ZdZdef fdZe	 	 	 	 	 ddeej	                 deej	                 dee
         dee
         d	ee
         d
eeef         fd            Z xZS )CLIPSegEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`CLIPSegEncoderLayer`].

    Args:
        config: CLIPSegConfig
    r[   c                     t                                                       | _        t          j        fdt          j                  D                       | _        d| _        d S )Nc                 .    g | ]}t                    S rM   )r   r>   r   r[   s     r&   
<listcomp>z+CLIPSegEncoder.__init__.<locals>.<listcomp>  s"    $j$j$jQ%8%@%@$j$j$jr(   F)	rh   ri   r[   r   
ModuleListranger  layersgradient_checkpointingrz   s    `r&   ri   zCLIPSegEncoder.__init__  sa    m$j$j$j$j%PVPhJiJi$j$j$jkk&+###r(   Nr   r   r   output_hidden_statesreturn_dictr   c                 @   ||n| j         j        }||n| j         j        }||n| j         j        }|rdnd}|rdnd}|}	t	          | j                  D ]2\  }
}|r||	fz   } ||	|||          }|d         }	|r||d         fz   }3|r||	fz   }t          |	||          S )a  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Causal mask for the text model. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        NrM   )r   r   r   )last_hidden_staterP   rQ   )r[   r   r&  use_return_dict	enumerater$  r   )r@   r   r   r   r   r&  r'  encoder_statesall_attentionsrP   idxencoder_layerlayer_outputss                r&   r   zCLIPSegEncoder.forward  s    N 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]3=0:d%"+DK"8"8 	F 	FC# C!/=2B!B)M%"3	  M *!,M  F!/=3C2E!E 	?+}.>>N+>Vd
 
 
 	
r(   NNNNN)rG   rH   rI   rJ   r   ri   r   r   r#   r   r   r   rC   r   r   r   r   s   @r&   r  r    s         ,} , , , , , ,  268<,0/3&*C
 C
 !.C
  (5	C

 $D>C
 'tnC
 d^C
 
uo%	&C
 C
 C
 C
 C
 C
 C
 C
r(   r  c                        e Zd Zdef fdZe	 	 	 	 	 	 ddeej                 deej                 deej                 dee	         dee	         d	ee	         d
e
eef         fd            Z xZS )CLIPSegTextTransformerr[   c                    t                                                       || _        |j        }t	          |          | _        t          |          | _        t          j	        ||j
                  | _        |j        | _        d S r   )rh   ri   r[   rj   r   r|   r  encoderr   r   r   final_layer_normeos_token_idr   s      r&   ri   zCLIPSegTextTransformer.__init__.  ss    &	/77%f-- "YF<Q R R R #/r(   Nr   r   rc   r   r&  r'  r   c                 *   ||n| j         j        }||n| j         j        }||n| j         j        }|t	          d          |                                }|                    d|d                   }|                     ||          }t          ||j	        |j
                  }	|t          ||j	                  }|                     |||	|||          }
|
d         }|                     |          }| j        dk    rg|t          j        |j        d         |j
                  |                    t          j        |j
                                      d	          f         }n|t          j        |j        d         |j
                  |                    t          j        |j
                  | j        k                                                        d	          f         }|s||f|
d
d          z   S t+          |||
j        |
j                  S )NzYou have to specify input_idsre   )r   rc   r   )r   r   r   r   r&  r'  r   rb   )r   r    r   r   r)  pooler_outputrP   rQ   )r[   r   r&  r*  r   r   r   r|   r   r   r    r   r5  r6  r7  r#   r$   r   r   r   argmaxr   rP   rQ   )r@   r   r   rc   r   r&  r'  input_shaperP   r   encoder_outputsr)  rU   s                r&   r   zCLIPSegTextTransformer.forward9  sH    2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]<===nn&&NN2{277	),WW !A,]5I!
 !
 !
 %7H[\\N,,')"7/!5# ' 
 
 ,A. 112CDD!! ..4Q7@Q@XYYY595F5MNNUUZ\U]]_MM ..4Q7@Q@XYYY EI6G6NOOSWSddB!M  	L%}58KKK)/')7&1	
 
 
 	
r(   NNNNNN)rG   rH   rI   r   ri   r   r   r#   r   r   r   rC   r   r   r   r   s   @r&   r3  r3  -  s        	00 	0 	0 	0 	0 	0 	0  -115/3,0/3&*K
 K
EL)K
 !.K
 u|,	K

 $D>K
 'tnK
 d^K
 
u00	1K
 K
 K
 ^K
 K
 K
 K
 K
r(   r3  c                       e Zd ZU eed<   ddgZdef fdZdej        fdZ	d Z
e	 	 	 	 	 	 dd	eej                 d
eej                 deej                 dee         dee         dee         deeef         fd            Z xZS )CLIPSegTextModelr[   r   r   c                     t                                          |           t          |          | _        |                                  d S r   )rh   ri   r3  
text_model	post_initrz   s     r&   ri   zCLIPSegTextModel.__init__  s@       088r(   r   c                 $    | j         j        j        S r   rB  r|   r   rE   s    r&   get_input_embeddingsz%CLIPSegTextModel.get_input_embeddings  s    )99r(   c                 (    || j         j        _        d S r   rE  )r@   r   s     r&   set_input_embeddingsz%CLIPSegTextModel.set_input_embeddings  s    5:"222r(   Nr   r   rc   r   r&  r'  c                 8    |                      ||||||          S )a;  
        Examples:

        ```python
        >>> from transformers import AutoTokenizer, CLIPSegTextModel

        >>> tokenizer = AutoTokenizer.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegTextModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```r   r   rc   r   r&  r'  )rB  )r@   r   r   rc   r   r&  r'  s          r&   r   zCLIPSegTextModel.forward  s1    2 )%/!5#  
 
 	
r(   r>  )rG   rH   rI   r   rL   _no_split_modulesri   r   ModulerF  rH  r   r   r#   r   r   r   rC   r   r   r   r   s   @r&   r@  r@    s4        02GH0      :bi : : : :; ; ;  -115/3,0/3&*
 
EL)
 !.
 u|,	

 $D>
 'tn
 d^
 
u00	1
 
 
 ^
 
 
 
 
r(   r@  c                        e Zd Zdef fdZe	 	 	 	 ddeej                 dee	         dee	         dee	         d	ee	         d
e
eef         fd            Z xZS )CLIPSegVisionTransformerr[   c                 4   t                                                       || _        |j        }t	          |          | _        t          j        ||j                  | _	        t          |          | _        t          j        ||j                  | _        d S r   )rh   ri   r[   rj   rZ   r|   r   r   r   pre_layrnormr  r5  post_layernormr   s      r&   ri   z!CLIPSegVisionTransformer.__init__  s    &	1&99L8MNNN%f-- l9&:OPPPr(   NTr   r   r&  r'  r   r   c                    ||n| j         j        }||n| j         j        }||n| j         j        }|                     ||          }|                     |          }|                     ||||          }|d         }|d d dd d f         }	|                     |	          }	|s||	f|dd          z   S t          ||	|j	        |j
                  S )N)r   )r   r   r&  r'  r   r   r9  )r[   r   r&  r*  r|   rP  r5  rQ  r   rP   rQ   )
r@   r   r   r&  r'  r   rP   r=  r)  rU   s
             r&   r   z CLIPSegVisionTransformer.forward  s    2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]Oghh))-88,,'/!5#	 ' 
 
 ,A.)!!!Q'2++M:: 	L%}58KKK)/')7&1	
 
 
 	
r(   )NNNT)rG   rH   rI   r   ri   r   r   r#   rK   r   r   rC   r   r   r   r   s   @r&   rN  rN    s        Q2 Q Q Q Q Q Q  -1/3&*37$
 $
u01$
 $D>$
 'tn	$

 d^$
 #+4.$
 
u00	1$
 $
 $
 ^$
 $
 $
 $
 $
r(   rN  c                        e Zd ZU eed<   dZdef fdZdej        fdZ	e
	 	 	 	 	 ddeej                 dee         d	ee         d
ee         dee         deeef         fd            Z xZS )CLIPSegVisionModelr[   r   c                     t                                          |           t          |          | _        |                                  d S r   )rh   ri   rN  vision_modelrC  rz   s     r&   ri   zCLIPSegVisionModel.__init__  sA       4V<<r(   r   c                 $    | j         j        j        S r   )rV  r|   rs   rE   s    r&   rF  z'CLIPSegVisionModel.get_input_embeddings  s     +;;r(   NTr   r&  r   r'  c                 6    |                      |||||          S )a  
        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, CLIPSegVisionModel

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegVisionModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```r   r   r&  r   r'  )rV  )r@   r   r   r&  r   r'  s         r&   r   zCLIPSegVisionModel.forward  s0    :   %/!5%=# ! 
 
 	
r(   )NNNTN)rG   rH   rI   r   rL   main_input_nameri   r   rL  rF  r   r   r#   rK   r   r   rC   r   r   r   r   s   @r&   rT  rT    s        $O2      <bi < < < <  59,0/337&*"
 "
u01"
 $D>"
 'tn	"

 #+4."
 d^"
 
u00	1"
 "
 "
 ^"
 "
 "
 "
 "
r(   rT  c                       e Zd ZU eed<   def fdZ e            e	 	 ddej	        de
ej	                 de
ej	                 dej        fd                        Z e            e	 dd
ej        dedej        fd                        Ze	 	 	 	 	 	 	 	 	 dde
ej                 d
e
ej                 de
ej	                 de
ej                 de
e         de
e         de
e         dede
e         deeef         fd            Z xZS )r  r[   c                 T   t                                          |           t          |j        t                    s%t          dt          |j                   d          t          |j        t                    s%t          dt          |j                   d          |j        }|j        }|j	        |_	        |j	        |_	        |j
        | _
        |j        | _        |j        | _        t          |          | _        t!          |          | _        t%          j        | j        | j
        d          | _        t%          j        | j        | j
        d          | _        t%          j        t/          j        | j        j                            | _        |                                  d S )NzNconfig.text_config is expected to be of type CLIPSegTextConfig but is of type .zRconfig.vision_config is expected to be of type CLIPSegVisionConfig but is of type F)ra   )rh   ri   r  text_configr   	TypeErrortypevision_configr   r   projection_dimrj   r  r  r3  rB  rN  rV  r   r   r  r  rn   r#   tensorr[   logit_scale_init_valuelogit_scalerC  )r@   r[   r^  ra  r{   s       r&   ri   zCLIPSegModel.__init__'  s      &,.?@@ 	0+,,0 0 0  
 &.0CDD 	2-..2 2 2  
 (,+1+F(-3-H*$3)5 - 90==4]CC!#4+@$BU\a!b!b!b!y)<d>QX]^^^<T[5W(X(XYY 	r(   Nr   r   rc   r   c                 n    |                      |||          }|j        }|                     |          }|S )a  
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`CLIPSegTextModel`].

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, CLIPSegModel

        >>> tokenizer = AutoTokenizer.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
        >>> with torch.inference_mode():
        ...     text_features = model.get_text_features(**inputs)
        ```)r   r   rc   )rB  r:  r  )r@   r   r   rc   text_outputsrU   text_featuress          r&   get_text_featureszCLIPSegModel.get_text_featuresK  sI    4 48??)% 4C 4
 4

 %2,,];;r(   Tr   r   c                 l    |                      ||          }|j        }|                     |          }|S )an  
        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`CLIPSegVisionModel`].

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, CLIPSegModel
        >>> from transformers.image_utils import load_image

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> with torch.inference_mode():
        ...     image_features = model.get_image_features(**inputs)
        ```)r   r   )rV  r:  r  )r@   r   r   vision_outputsrU   image_featuress         r&   get_image_featureszCLIPSegModel.get_image_featureso  sH    < 6:5F5F%%= 6G 6
 6
 '4//>>r(   return_lossr   r&  r'  c
           	         ||n| j         j        }||n| j         j        }|	|	n| j         j        }	|                     |||||	          }
|                     ||||||	          }|
d         }|                     |          }|d         }|                     |          }||                    ddd          z  }||                    ddd          z  }| j	        
                                }t          j        ||                                          |z  }|                                }d}|rt          |          }|	s||||||
f}||f|z   n|S t          |||||||
	          S )
a  
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, CLIPSegModel
        >>> from transformers.image_utils import load_image

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
        ... )

        >>> with torch.inference_mode():
        ...     outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```NrY  rJ  r   rb   re   T)r   r   keepdim)r1   r2   r3   r4   r5   r6   r7   )r[   r   r&  r*  rV  rB  r  r  normre  expr#   r   r+   r.   r0   )r@   r   r   r   rc   rn  r   r&  r   r'  rk  rg  r5   r4   re  r3   r2   r1   outputs                      r&   r   zCLIPSegModel.forward  s   P 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]**%/!5%=# + 
 
 )%/!5# ' 
 
 &a(--l;;"1o**;77 $l&7&7!T&7&R&RR!K$4$4qb$$4$O$OO &**,,,{LNN4D4DEES*,,.. 	100D 	F&lT`bpqF)-)9TGf$$vE-+#%* .
 
 
 	
r(   )NNr   )	NNNNNNNTN)rG   rH   rI   r   rL   ri   r   r   r#   r   r   rK   ri  r   rm  r   r   rC   r0   r   r   r   s   @r&   r  r  #  s        "} " " " " " "H %$&& 26/3	   <  !.  u|,	 
 
	      ^ '& D %$&& *.# #'# #'# 
		# # # ^ '&#J  15481537&*,0/3)-&*]
 ]
E,-]
 u01]
 !.	]

 u/0]
 d^]
 $D>]
 'tn]
 #']
 d^]
 
um#	$]
 ]
 ]
 ^]
 ]
 ]
 ]
 ]
r(   r  c                        e Zd ZdZdef fdZ	 ddej        dej        dej        dee	         d	e
ej                 f
d
Z xZS )CLIPSegDecoderLayerz
    CLIPSeg decoder layer, which is identical to `CLIPSegEncoderLayer`, except that normalization is applied after
    self-attention/MLP, rather than before.
    r[   c                 D   t                                                       |j        | _        t	          |          | _        t          j        | j        |j                  | _	        t          |          | _        t          j        | j        |j                  | _        d S r   r   rz   s     r&   ri   zCLIPSegDecoderLayer.__init__  r   r(   FrP   r   r   r   r   c                     |}|                      ||||          \  }}||z   }|                     |          }|}|                     |          }||z   }|                     |          }|f}|r||fz  }|S r   )r   r   r   r   r   s           r&   r   zCLIPSegDecoderLayer.forward  s    " !&*nn')"7/	 '5 '
 '
#| !=0((77 // =0((77 " 	'&Gr(   r   )rG   rH   rI   rJ   r   ri   r#   r   r   r   rC   rK   r   r   r   s   @r&   ru  ru    s         S} S S S S S S -2' '|' '  %|	'
 $D>' 
u 	!' ' ' ' ' ' ' 'r(   ru  c                        e Zd Zdef fdZ	 	 	 ddeej                 dej        dee	         dee	         d	ee	         f
d
Z
 xZS )CLIPSegDecoderr[   c                    t                                                     j        | _        t          j        j        j                  | _        t          j        j        j                  | _        j	        r׉j
        j        dz  j
        j        dz  f}t          j        t          j        j        j        dd          t          j                    t          j        j        j        dz  |d         |d                   t          j                    t          j        j        dz  d|d         |d                             | _        n6t          j        j        dj
        j        j
        j                  | _        t#          j                  }t          j        fd	t)          |          D                       | _        t-          j        j
                  j        _        j        _        j        _        d
_        t          j        fdt)          t#          j                            D                       | _        d S )N   r	   r   )r_   paddingrb   r   )r_   r`   )r`   c                 X    g | ]&}t          j        j        j        j                  'S rM   )r   r   ra  rj   
reduce_dimr   s     r&   r!  z+CLIPSegDecoder.__init__.<locals>.<listcomp>Q  s/    bbbPQRYv+79JKKbbbr(   reluc                 .    g | ]}t                    S rM   )ru  )r>   r   decoder_configs     r&   r!  z+CLIPSegDecoder.__init__.<locals>.<listcomp>Y  s"    $t$t$tQ%8%H%H$t$t$tr(   )rh   ri   conditional_layerr   r   rb  r~  film_mulfilm_add"use_complex_transposed_convolutionra  rm   
Sequentialrq   ReLUConvTranspose2dtransposed_convolutionr%   extract_layersr"  r#  reducescopydeepcopyrj   decoder_num_attention_headsr   decoder_intermediate_sizer   r   r$  )r@   r[   transposed_kernelsdepthr  r{   s    `  @r&   ri   zCLIPSegDecoder.__init__1  s/      !'!9	&"79JKK	&"79JKK4 	"("6"AQ"FH\HgklHl!m*,-	&+V->AWXYYY		"%%* 21 5-a0	   		"%*A;Ma;PYklmYn  + +D'' +-*<!1f&:&EfNbNm+ + +D' F)**}bbbbUZ[`UaUabbb
 
 v';<<%+%6"-3-O*+1+K($*!m$t$t$t$tRWX[\b\qXrXrRsRs$t$t$tuur(   NTrP   rT   r   r&  r'  c                    |rdnd }|rdnd }|d d d         }d }	t          t          || j        | j                            D ]\  }
\  }}}|	 ||          |	z   }	n ||          }	|
| j        k    rZ|                     |          |	                    ddd          z  |                     |          z   }	|	                    ddd          }	 ||	d d |          }|d         }	|r||	fz  }|r||d         fz  }|	d d dd d d f                             ddd          }	t          t          j
        |	j        d                             }|j        d         }|	                    ||	j        d         ||          }	|                     |	                              d          }|st          d |||fD                       S t!          |||          S )	NrM   re   r   r   rb   )r   r   r   c              3      K   | ]}||V  	d S r   rM   )r>   vs     r&   rA   z)CLIPSegDecoder.forward.<locals>.<genexpr>  s(      aaqSTS`S`S`S`S`aar(   )r   rP   rQ   )r+  zipr$  r  r  r  r   r  r   mathsqrtr   r   r  squeezerC   rO   )r@   rP   rT   r   r&  r'  all_hidden_statesr-  activationsrs  i
activationlayerreducer0  r   r   r   s                     r&   r   zCLIPSegDecoder.forward[  sB    #7@BBD0:d#DDbD).7KVZVb8c8c.d.d 	6 	6*A*
E6!
++f4
++D***'=>>PQSTVWAXAXX[_[h[h*\ \   1a00!Et4[l  M #1%F# /!fY.!  6=#3"55122qqq!))!Q2249V\!_--..+1!4
Za$EE,,V44<<Q?? 	baaV->$Oaaaaaa#+%
 
 
 	
r(   )NNT)rG   rH   rI   r   ri   rC   r#   r   r   r   r   r   r   s   @r&   ry  ry  0  s        (v} (v (v (v (v (v (v\ -1/3&*6
 6
U\*6
 !&6
 $D>	6

 'tn6
 d^6
 6
 6
 6
 6
 6
 6
 6
r(   ry  zn
    CLIPSeg model with a Transformer-based decoder on top for zero-shot and one-shot image segmentation.
    )custom_introc                       e Zd ZU eed<   def fdZ	 	 	 	 	 ddee         deej	                 deej	                 deej	                 deej	                 f
d	Z
e	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej	                 deej                 deej                 dee         dee         dedee         deeef         fd            Z xZS )CLIPSegForImageSegmentationr[   c                     t                                          |           || _        t          |          | _        |j        | _        t          |          | _        |                                  d S r   )	rh   ri   r[   r  r  r  ry  decoderrC  rz   s     r&   ri   z$CLIPSegForImageSegmentation.__init__  sc        ((	$3%f-- 	r(   Nr   r   r   rc   conditional_pixel_valuesc                    |kt          |          |k    rt          d          t          j                    5  | j                            |||          }d d d            n# 1 swxY w Y   ny|ht          |          |k    rt          d          t          j                    5  | j                            |          }d d d            n# 1 swxY w Y   nt          d          |S )Nz@Make sure to pass as many prompt texts as there are query images)r   rc   zAMake sure to pass as many prompt images as there are query imagesz[Invalid conditional, should be either provided as `input_ids` or `conditional_pixel_values`)r%   r   r#   no_gradr  ri  rm  )r@   r   r   r   rc   r  rT   s          r&   get_conditional_embeddingsz6CLIPSegForImageSegmentation.get_conditional_embeddings  s     9~~++ !cddd  )-)D)Dn< *E * *&               &1+,,
:: !deee ` `)-)E)EF^)_)_&` ` ` ` ` ` ` ` ` ` ` ` ` ` ` m   &%s#   A""A&)A&%CCCTr   rT   labelsr   r&  r   r'  r   c                    ||n| j         j        }t          j                    5  | j                            ||d|
|          }| j                            |d                   }|r|j        n|d         fd| j        D             }|r,t          |j
        |j        |	r|j        nd|j                  }n|	s|dd         |dd         z   n|}ddd           n# 1 swxY w Y   |&|                     |j        d	         ||||
          }nU|j        d	         |j        d	         k    rt          d          |j        d         | j         j        k    rt          d          |                     ||||	|          }|r|j        n|d	         }d}|9|                    |j                  }t+          j                    } |||          }|s|||||f}||f|z   n|S t/          ||||||          S )a~  
        conditional_pixel_values (`torch.FloatTensor`, *optional*):
            The pixel values of the conditional images.
        conditional_embeddings (`torch.FloatTensor` of shape `(batch_size, config.projection_dim)`, *optional*):
            The conditional embeddings for the query images. If provided, the model will use this instead of computing
            the embeddings from the conditional_pixel_values.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, CLIPSegForImageSegmentation
        >>> from transformers.image_utils import load_image

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegForImageSegmentation.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> texts = ["a cat", "a remote", "a blanket"]
        >>> inputs = processor(text=texts, images=[image] * len(texts), padding=True, return_tensors="pt")

        >>> with torch.inference_mode():
        ...     outputs = model(**inputs)

        >>> logits = outputs.logits
        >>> print(logits.shape)
        torch.Size([3, 352, 352])
        ```NTrY  r   rb   c                 &    g | ]}|d z            S )r   rM   )r>   r  rP   s     r&   r!  z7CLIPSegForImageSegmentation.forward.<locals>.<listcomp>  s"    MMMA=Q/MMMr(   r9  r	   r   )r   r   r   rc   r  zWMake sure to pass as many conditional embeddings as there are query images in the batchzcMake sure that the feature dimension of the conditional embeddings matches `config.projection_dim`.)r   r&  r'  )r1   r   rT   rU   r7   rV   )r[   r*  r#   r  r  rV  r  rP   r  r   r)  r:  rQ   r  r   r   rb  r  r   r   r    r   BCEWithLogitsLossrS   )r@   r   r   r  rT   r   rc   r  r   r&  r   r'  rk  rU   r  decoder_outputsr   r1   loss_fnrs  rP   s                       @r&   r   z#CLIPSegForImageSegmentation.forward  s   b &1%<kk$+B] ]__ 	 	!Y33)"3%))A' 4  N !I77q8IJJM<G^N88^\]M^MMMMM9LMMMK  
!;&4&F"0">BV"`.">">\`-8	" " " DXkN2A2&);;;]k /	 	 	 	 	 	 	 	 	 	 	 	 	 	 	8 ")%)%D%D'-a0#-))A &E & &"" &+A.,2DQ2GGG m   &+A.$+2LLL 0   ,,"/!5# ' 
 
 ,7N''OA<NYYv}--F*,,G766**D 	F4m^UdeF)-)9TGf$$vE-#9' .*
 
 
 	
s   B+CC #C r1  )NNNNNNNNNTN)rG   rH   rI   r   rL   ri   r   r   r#   r   r  r   rK   r   r   r   rC   r0   r   r   r   s   @r&   r  r    s         }       %),015/3;?& &SM& EL)& !.	&
 u|,& #+5<"8& & & &:  2648@D>B1537-1,0/3)-&*~
 ~
E-.~
 u01~
 #+5+<"=	~

 !)): ;~
 !.~
 u/0~
 )*~
 $D>~
 'tn~
 #'~
 d^~
 
um#	$~
 ~
 ~
 ^~
 ~
 ~
 ~
 ~
r(   r  )r  r  r@  rT  r  )r   )@rJ   r  r  dataclassesr   typingr   r   r   r   r#   r   r  r
   modeling_attn_mask_utilsr   r   modeling_layersr   modeling_outputsr   r   modeling_utilsr   r   utilsr   r   r   r   r   r   configuration_clipsegr   r   r   
get_loggerrG   loggerr   r'   r.   r0   rO   rS   rL  rZ   r   floatr   r   r   r   r  r  r3  r@  rN  rT  r  ru  ry  r  __all__rM   r(   r&   <module>r     s       ! ! ! ! ! ! 1 1 1 1 1 1 1 1 1 1 1 1        ! ! ! ! ! ! d d d d d d d d 9 9 9 9 9 9 K K K K K K K K F F F F F F F F w w w w w w w w w w w w w w w w X X X X X X X X X X 
	H	%	%
`U\ `el ` ` ` `
-U\ -el - - - -  
  
  
  
  
K  
  
   
F : : : : :; : :  : 
 
 
 
 
[ 
 
  
<P P P P Pbi P P Ph% % % % %BI % % %` % %I%<% 
% <	%
 U\*% % % % % %.G) G) G) G) G)ry G) G) G)V        / / / / /4 / / /d ,% ,% ,% ,% ,%_ ,% ,% ,%`S
 S
 S
 S
 S
RY S
 S
 S
lX
 X
 X
 X
 X
RY X
 X
 X
v1
 1
 1
 1
 1
- 1
 1
 1
h1
 1
 1
 1
 1
ry 1
 1
 1
h0
 0
 0
 0
 0
/ 0
 0
 0
f P
 P
 P
 P
 P
) P
 P
 P
f6 6 6 6 6") 6 6 6ra
 a
 a
 a
 a
+ a
 a
 a
H   
l
 l
 l
 l
 l
"8 l
 l
 
l
^  r(   