
     `iz)                        d Z ddlmZ ddlmZ ddlZddlmZ ddlmZ ddl	m
Z
mZ dd	lmZ d
dlmZ e ed           G d de
                                  Ze G d de                      Z G d dej                  Z G d dej                  Z G d dej                  Z G d dej                  Z G d dej                  Z ed           G d de                      ZddgZdS )zPyTorch ViTMatte model.    )	dataclass)OptionalN)nn   )PreTrainedModel)ModelOutputauto_docstring)load_backbone   )VitMatteConfigz4
    Class for outputs of image matting models.
    )custom_introc                       e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
eeej                          ed<   dZeeej                          ed<   dS )ImageMattingOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Loss.
    alphas (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
        Estimated alpha values.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each stage) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states
        (also called feature maps) of the model at the output of each stage.
    Nlossalphashidden_states
attentions)__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__r   r   tupler        /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/vitmatte/modeling_vitmatte.pyr   r      s         	 	 )-D(5$
%,,,*.FHU&'...8<M8E%"345<<<59Ju01299999r   r   c                   <    e Zd ZU eed<   dZdZg Zdej	        fdZ
dS )VitMattePreTrainedModelconfigpixel_valuesTmodulec                     t          |t          j        t          j        f          rR|j        j                            d| j        j                   |j	        "|j	        j        
                                 d S d S d S )Ng        )meanstd)
isinstancer   Conv2dBatchNorm2dweightdatanormal_r!   initializer_rangebiaszero_)selfr#   s     r   _init_weightsz%VitMattePreTrainedModel._init_weights<   su    fry".9:: 	)M&&CT[5R&SSS{& &&(((((	) 	)&&r   N)r   r   r   r   r   main_input_namesupports_gradient_checkpointing_no_split_modulesr   Moduler1   r   r   r   r    r    5   sP         $O&*#)BI ) ) ) ) ) )r   r    c                   *     e Zd ZdZd fd	Zd Z xZS )VitMatteBasicConv3x3zP
    Basic convolution layers including: Conv3x3, BatchNorm2d, ReLU layers.
       r   c                     t                                                       t          j        ||d||d          | _        t          j        ||j                  | _        t          j                    | _	        d S )Nr   F)in_channelsout_channelskernel_sizestridepaddingr.   )eps)
super__init__r   r(   convr)   batch_norm_eps
batch_normReLUrelu)r0   r!   r:   r;   r=   r>   	__class__s         r   rA   zVitMatteBasicConv3x3.__init__H   so    I#%
 
 
	 .6;PQQQGII			r   c                     |                      |          }|                     |          }|                     |          }|S N)rB   rD   rF   r0   hidden_states     r   forwardzVitMatteBasicConv3x3.forwardU   s;    yy..|44yy..r   )r8   r   r   r   r   r   rA   rL   __classcell__rG   s   @r   r7   r7   C   sV                    r   r7   c                   (     e Zd ZdZ fdZd Z xZS )VitMatteConvStreamzc
    Simple ConvStream containing a series of basic conv3x3 layers to extract detail features.
    c                    t                                                       d}|j        |j        j        }|j        }t          j                    | _        |g|z   | _        t          t          | j                  dz
            D ]H}| j        |         }| j        |dz            }| j                            t          |||                     Id S )N   r   )r@   rA   backbone_confignum_channelsconvstream_hidden_sizesr   
ModuleListconvs
conv_chansrangelenappendr7   )r0   r!   r:   r;   iin_chan_	out_chan_rG   s          r   rA   zVitMatteConvStream.__init__b   s     !- 0=K5]__
&-,6s4?++a/00 	Q 	QAq)HA.IJ268YOOPPPP	Q 	Qr   c                     d|i}|}t          t          | j                            D ]2} | j        |         |          }dt          |dz             z   }|||<   3|S )Ndetailed_feature_map_0detailed_feature_map_r   )rZ   r[   rX   str)r0   r"   out_dict
embeddingsr]   name_s         r   rL   zVitMatteConvStream.forwardu   si    ,l;!
s4:'' 	) 	)A&Az22J+c!a%jj8E(HUOOr   rM   rO   s   @r   rQ   rQ   ]   sV         Q Q Q Q Q&      r   rQ   c                   (     e Zd ZdZ fdZd Z xZS )VitMatteFusionBlockz\
    Simple fusion block to fuse features from ConvStream and Plain Vision Transformer.
    c                 z    t                                                       t          |||dd          | _        d S )Nr   )r=   r>   )r@   rA   r7   rB   )r0   r!   r:   r;   rG   s       r   rA   zVitMatteFusionBlock.__init__   s9    (lST^_```			r   c                     t           j                            |ddd          }t          j        ||gd          }|                     |          }|S )Nr8   bilinearF)scale_factormodealign_cornersr   )dim)r   
functionalinterpolater   catrB   )r0   featuresdetailed_feature_mapupscaled_featuresouts        r   rL   zVitMatteFusionBlock.forward   sR    M55hQU_ot5uui-/@AqIIIiinn
r   rM   rO   s   @r   rh   rh      sV         a a a a a      r   rh   c                   (     e Zd ZdZ fdZd Z xZS )VitMatteHeadzJ
    Simple Matting Head, containing only conv3x3 and conv1x1 layers.
    c                 B   t                                                       |j        d         }d}t          j        t          j        ||ddd          t          j        |          t          j        d          t          j        |dddd                    | _        d S )N   r   r   )r<   r=   r>   Tr   )	r@   rA   fusion_hidden_sizesr   
Sequentialr(   r)   rE   matting_convs)r0   r!   r:   mid_channelsrG   s       r   rA   zVitMatteHead.__init__   s    04]Ik<QqRSTTTN<((GDMMIlA1QJJJ	
 
r   c                 0    |                      |          }|S rI   )r~   rJ   s     r   rL   zVitMatteHead.forward   s    )),77r   rM   rO   s   @r   rx   rx      sQ         
 
 
 
 
      r   rx   c                   (     e Zd ZdZ fdZd Z xZS )VitMatteDetailCaptureModulezG
    Simple and lightweight Detail Capture Module for ViT Matting.
    c           
         t                                                       t          |j                  t          |j                  dz   k    rt          d          || _        t          |          | _        | j        j	        | _	        t          j                    | _        |j        g|j        z   | _        t          t          | j                  dz
            D ]W}| j                            t#          || j        |         | j	        |dz             z   | j        |dz                                 Xt%          |          | _        d S )Nr   z_The length of fusion_hidden_sizes should be equal to the length of convstream_hidden_sizes + 1.)r!   r:   r;   )r@   rA   r[   r|   rV   
ValueErrorr!   rQ   
convstreamrY   r   rW   fusion_blockshidden_sizefusion_channelsrZ   r\   rh   rx   matting_head)r0   r!   r]   rG   s      r   rA   z$VitMatteDetailCaptureModule.__init__   s>   v)**c&2P.Q.QTU.UUUq   ,V44/4]__ & 23f6PPs4/001455 	 	A%%#! $ 4Q 7$/APQE(:S S!%!5a!e!<      )00r   c                 T   |                      |          }t          t          | j                            D ]I}dt	          t          | j                  |z
  dz
            z   } | j        |         |||                   }Jt          j        |                     |                    }|S )Nrb   r   )r   rZ   r[   r   rc   r   sigmoidr   )r0   rs   r"   detail_featuresr]   detailed_feature_map_namer   s          r   rL   z#VitMatteDetailCaptureModule.forward   s    //,77s4-..// 	c 	cA(?#c$J\F]F]`aFadeFeBfBf(f%,t)!,XG`7abbHHt00::;;r   rM   rO   s   @r   r   r      sQ         1 1 1 1 12      r   r   zX
    ViTMatte framework leveraging any vision backbone e.g. for ADE20k, CityScapes.
    c                        e Zd Z fdZe	 	 	 	 	 d	deej                 dee         dee         deej                 dee         f
d            Z	 xZ
S )
VitMatteForImageMattingc                     t                                          |           || _        t          |          | _        t          |          | _        |                                  d S rI   )r@   rA   r!   r
   backboner   decoder	post_init)r0   r!   rG   s     r   rA   z VitMatteForImageMatting.__init__   sX       %f--26:: 	r   Nr"   output_attentionsoutput_hidden_stateslabelsreturn_dictc                 v   ||n| j         j        }||n| j         j        }||n| j         j        }d}|t	          d          | j                            |||          }|j        d         }|                     ||          }	|s|	f|dd         z   }
||f|
z   n|
S t          ||	|j
        |j                  S )a8  
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth image matting for computing the loss.

        Examples:

        ```python
        >>> from transformers import VitMatteImageProcessor, VitMatteForImageMatting
        >>> import torch
        >>> from PIL import Image
        >>> from huggingface_hub import hf_hub_download

        >>> processor = VitMatteImageProcessor.from_pretrained("hustvl/vitmatte-small-composition-1k")
        >>> model = VitMatteForImageMatting.from_pretrained("hustvl/vitmatte-small-composition-1k")

        >>> filepath = hf_hub_download(
        ...     repo_id="hf-internal-testing/image-matting-fixtures", filename="image.png", repo_type="dataset"
        ... )
        >>> image = Image.open(filepath).convert("RGB")
        >>> filepath = hf_hub_download(
        ...     repo_id="hf-internal-testing/image-matting-fixtures", filename="trimap.png", repo_type="dataset"
        ... )
        >>> trimap = Image.open(filepath).convert("L")

        >>> # prepare image + trimap for the model
        >>> inputs = processor(images=image, trimaps=trimap, return_tensors="pt")

        >>> with torch.no_grad():
        ...     alphas = model(**inputs).alphas
        >>> print(alphas.shape)
        torch.Size([1, 1, 640, 960])
        ```NzTraining is not yet supported)r   r   rz   r   )r   r   r   r   )r!   use_return_dictr   r   NotImplementedErrorr   forward_with_filtered_kwargsfeature_mapsr   r   r   r   )r0   r"   r   r   r   r   r   outputsrs   r   outputs              r   rL   zVitMatteForImageMatting.forward   s	   R &1%<kk$+B]$8$D  $+Jj 	 2C1N--TXT_Tq%&EFFF-<</CWh = 
 
 '+h55 	FY,F)-)9TGf$$vE!!/)	
 
 
 	
r   )NNNNN)r   r   r   rA   r	   r   r   TensorboolrL   rN   rO   s   @r   r   r      s              04,0/3)-&*B
 B
u|,B
 $D>B
 'tn	B

 &B
 d^B
 B
 B
 ^B
 B
 B
 B
 B
r   r   )r   dataclassesr   typingr   r   r   modeling_utilsr   utilsr   r	   utils.backbone_utilsr
   configuration_vitmatter   r   r    r5   r7   rQ   rh   rx   r   r   __all__r   r   r   <module>r      sq     ! ! ! ! ! !              - - - - - - 0 0 0 0 0 0 0 0 1 1 1 1 1 1 2 2 2 2 2 2   
: : : : : : :  :$ 
) 
) 
) 
) 
)o 
) 
) 
)    29   4               F    ")   "    29   0& & & & &") & & &R   
N
 N
 N
 N
 N
5 N
 N
 
N
b %&?
@r   