
     `i^                     P   d dl Zd dlZd dlmZ d dlmZmZ d dlZ	d dl
Z
d dlmc mZ d dl
mZmZ ddlmZ ddlmZmZmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZmZm Z  ddl!m"Z" ddl#m$Z$  e            rd dl%m&Z&  e             rd dl'm(Z( d dl)m*Z* e ed           G d de                                  Z+	 dPde
j        de
j        de
j        fdZ,dededefdZ-de
j        de
j        de
j        fdZ. G d d ej/                  Z0deded!e1defd"Z2de
j        de
j        d!e1de
j        fd#Z3 G d$ d%ej/                  Z4 G d& d'ej/                  Z5 G d( d)ej/                  Z6	 dQd+ej/        d,e
j        d-e
j        d.e
j        d/ee
j                 d0e7d1e7fd2Z8 G d3 d4ej/                  Z9 G d5 d6ej/                  Z:dRd7e
j        d8e7d9e;de
j        fd:Z< G d; d<ej/                  Z= G d= d>ej/                  Z> G d? d@ej/                  Z? G dA dBe          Z@ G dC dDejA                  ZB G dE dFej/                  ZC G dG dHej/                  ZD G dI dJej/                  ZEe G dK dLe                      ZF edM           G dN dOeF                      ZGdLdOgZHdS )S    N)	dataclass)CallableOptional)Tensornn   )ACT2FN)ModelOutputis_scipy_availablerequires_backends)GradientCheckpointingLayer)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringis_accelerate_available)check_model_inputs   )
EomtConfig)linear_sum_assignment)PartialState)reducea  
    Class for outputs of [`EomtForUniversalSegmentationOutput`].

    This output can be directly passed to [`~EomtImageProcessor.post_process_semantic_segmentation`] or
    [`~EomtImageProcessor.post_process_instance_segmentation`] or
    [`~EomtImageProcessor.post_process_panoptic_segmentation`] to compute final segmentation maps. Please, see
    [`~EomtImageProcessor] for details regarding usage.
    )custom_introc                   4   e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
eej                 ed<   dZeej                 ed<   dZeeej                          ed<   dZeeej                          ed<   dZeeej                          ed	<   dS )
"EomtForUniversalSegmentationOutputa*  
    loss (`torch.Tensor`, *optional*):
        The computed loss, returned when labels are present.
    class_queries_logits (`torch.FloatTensor`):
        A tensor of shape `(batch_size, num_queries, num_labels + 1)` representing the proposed classes for each
        query. Note the `+ 1` is needed because we incorporate the null class.
    masks_queries_logits (`torch.FloatTensor`):
        A tensor of shape `(batch_size, num_queries, height, width)` representing the proposed masks for each
        query.
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
        Last hidden states (final feature map) of the last layer.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, sequence_length, hidden_size)`. Hidden-states all layers of the model.
    attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `tuple(torch.FloatTensor)` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`. Self and Cross Attentions weights from transformer decoder.
    patch_offsets (`list[torch.Tensor]`, *optional*):
        list of tuples indicating the image index and start and end positions of patches for semantic segmentation.
    Nlossclass_queries_logitsmasks_queries_logitslast_hidden_statehidden_states
attentionspatch_offsets)__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__r   r   r    r!   tupler"   r#   listr        z/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/eomt/modeling_eomt.pyr   r   2   s          * )-D(5$
%,,,8<(5#45<<<8<(5#45<<<59x 129998<M8E%"345<<<59Ju01299926M8D./66666r.   r   Finput_featurespoint_coordinatesreturnc                     |                                 dk    rd}|                    d          }t          j        j        j        | d|z  dz
  fi |}|r|                    d          }|S )a(  
    A wrapper around `torch.nn.functional.grid_sample` to support 3D point_coordinates tensors.

    Args:
        input_features (`torch.Tensor` of shape (batch_size, channels, height, width)):
            A tensor that contains features map on a height * width grid
        point_coordinates (`torch.Tensor` of shape (batch_size, num_points, 2) or (batch_size, grid_height, grid_width,:
        2)):
            A tensor that contains [0, 1] * [0, 1] normalized point coordinates
        add_dim (`bool`):
            boolean value to keep track of added dimension

    Returns:
        point_features (`torch.Tensor` of shape (batch_size, channels, num_points) or (batch_size, channels,
        height_grid, width_grid):
            A tensor that contains features for points in `point_coordinates`.
    r   T   g       @      ?)dim	unsqueezer(   r   
functionalgrid_samplesqueeze)r0   r1   add_dimkwargspoint_featuress        r/   sample_pointr>   ]   s    ( !##-77:: X(4^SK\E\_bEbmmflmmN 3'//22r.   inputslabelsc                 (   |                                                      d          } dt          j        | |j                  z  }|                     d          dddf         |                    d          dddf         z   }d|dz   |dz   z  z
  }|S )a  
    A pair wise version of the dice loss, see `dice_loss` for usage.

    Args:
        inputs (`torch.Tensor`):
            A tensor representing a mask
        labels (`torch.Tensor`):
            A tensor with the same shape as inputs. Stores the binary classification labels for each element in inputs
            (0 for the negative class and 1 for the positive class).

    Returns:
        `torch.Tensor`: The computed loss between each pairs.
    r   r4   N)sigmoidflattenr(   matmulTsum)r?   r@   	numeratordenominatorr   s        r/   pair_wise_dice_lossrJ   }   s     ^^%%a((FEL222I**R..D)FJJrNN47,CCK	A+/22DKr.   c                 F   | j         d         }t          j        d          } || t          j        |                     } || t          j        |                     }t          j        ||z  |j                  }t          j        ||z  d|z
  j                  }||z   }|S )a  
    A pair wise version of the cross entropy loss, see `sigmoid_cross_entropy_loss` for usage.

    Args:
        inputs (`torch.Tensor`):
            A tensor representing a mask.
        labels (`torch.Tensor`):
            A tensor with the same shape as inputs. Stores the binary classification labels for each element in inputs
            (0 for the negative class and 1 for the positive class).

    Returns:
        loss (`torch.Tensor`): The computed loss between each pairs.
    r   none	reduction)shaper   BCEWithLogitsLossr(   	ones_like
zeros_likerE   rF   )	r?   r@   height_and_width	criterioncross_entropy_loss_poscross_entropy_loss_negloss_posloss_negr   s	            r/   $pair_wise_sigmoid_cross_entropy_lossrY      s     |A$v666I&Yvuv/F/FGG&Yvu/?/G/GHH|25EEvxPPH|25EEF
~VVHhDKr.   c                        e Zd ZdZ	 ddedededef fdZ ej                    d	ej	        d
ej	        dej	        dej	        de
ee	                  f
d            Z xZS )EomtHungarianMatcheraq  This class computes an assignment between the labels and the predictions of the network.

    For efficiency reasons, the labels don't include the no_object. Because of this, in general, there are more
    predictions than labels. In this case, we do a 1-to-1 matching of the best predictions, while the others are
    un-matched (and thus treated as non-objects).
    r5    1  
cost_class	cost_mask	cost_dice
num_pointsc                     t                                                       |dk    r|dk    r|dk    rt          d          || _        || _        || _        || _        dS )aH  Creates the matcher

        Params:
            cost_class (`float`, *optional*, defaults to 1.0):
                Relative weight of the classification error in the matching cost.
            cost_mask (`float`, *optional*,  defaults to 1.0):
                This is the relative weight of the focal loss of the binary mask in the matching cost.
            cost_dice (`float`, *optional*, defaults to 1.0):
                This is the relative weight of the dice loss of the binary mask in the matching cost.
            num_points (`int`, *optional*, defaults to 12544):
                No. of points to sample on which the mask loss will be calculated. The same set of K points are
                uniformly sampled for all prediction and ground truth masks to construct the cost matrix for bipartite
                matching.
        r   zAll costs can't be 0N)super__init__
ValueErrorr`   r]   r^   r_   )selfr]   r^   r_   r`   	__class__s        r/   rc   zEomtHungarianMatcher.__init__   sc    " 	??yA~~)q..3444$$""r.   r   r   mask_labelsclass_labelsr2   c                 H   g }|j         d         }t          |          D ]}||                             d          }||         }	|dd||         f          }
||                             |	          }|dddf         }|	dddf         }	t	          j        d| j        d|	j                  }|                    |j         d         dd          }t          ||d          
                    d          }|                    |	j         d         dd          }t          |	|d          
                    d          }	t          |	|          }t          |	|          }| j        |z  | j        |
z  z   | j        |z  z   }t	          j        |t	          j        d	                    }t	          j        |t	          j        d
                    }t	          j        |d          }t)          |                                          }|                    |           d |D             }|S )ao  
        Params:
            masks_queries_logits (`torch.Tensor`):
                A tensor of dim `batch_size, num_queries, num_labels` with the classification logits.
            class_queries_logits (`torch.Tensor`):
                A tensor of dim `batch_size, num_queries, height, width` with the predicted masks.
            class_labels (`torch.Tensor`):
                A tensor of dim `num_target_boxes` (where num_target_boxes is the number of ground-truth objects in the
                target) containing the class labels.
            mask_labels (`torch.Tensor`):
                A tensor of dim `num_target_boxes, height, width` containing the target masks.

        Returns:
            matched_indices (`list[tuple[Tensor]]`): A list of size batch_size, containing tuples of (index_i, index_j)
            where:
                - index_i is the indices of the selected predictions (in order)
                - index_j is the indices of the corresponding selected labels (in order)
            For each batch element, it holds:
                len(index_i) = len(index_j) = min(num_queries, num_target_boxes).
        r   rB   Nr   r4   deviceFalign_cornersg    _Bg    _c                     g | ]E\  }}t          j        |t           j                   t          j        |t           j                   fFS )dtype)r(   	as_tensorint64).0ijs      r/   
<listcomp>z0EomtHungarianMatcher.forward.<locals>.<listcomp>  sR     
 
 
_c_`bcU_Qek222EOAU[4Y4Y4YZ
 
 
r.   )rO   rangesoftmaxtor(   randr`   rk   repeatr>   r:   rY   rJ   r^   r]   r_   minimumtensormaximum
nan_to_numr   cpuappend)re   r   r   rg   rh   indices
batch_sizert   
pred_probs	pred_maskr]   target_maskr1   target_coordinatespred_coordinatesr^   r_   cost_matrixassigned_indicesmatched_indicess                       r/   forwardzEomtHungarianMatcher.forward   s6   8 *, */2
z"" 	- 	-A-a088<<J,Q/I %QQQQ%788J%a.++I66K%aaag.K!!!!T'*I !&
1doqIY Z Z Z!2!9!9+:KA:NPQST!U!U&{4FV[\\\ddefggK077	8JAqQQ$Y0@PUVVV^^_`aaI =YTTI+I{CCI.94t7SSVZVdgpVppK-U\$5G5GHHK-U\%5H5HIIK*;::K0EkooFWFW0X0XNN+,,,,
 
gn
 
 
 r.   )r5   r5   r5   r\   )r$   r%   r&   r'   floatintrc   r(   no_gradr   r,   r+   r   __classcell__rf   s   @r/   r[   r[      s          jo# ##27#JO#cf# # # # # #4 U]__D#lD $lD \	D
 lD 
eFm	D D D _D D D D Dr.   r[   	num_masksc                 *   |                                                      d          }d||z                      d          z  }|                    d          |                    d          z   }d|dz   |dz   z  z
  }|                                |z  }|S )a4  
    Compute the DICE loss, similar to generalized IOU for masks as follows:

    $$ \mathcal{L}_{\text{dice}(x, y) = 1 - \frac{2 * x \cap y }{x \cup y + 1}} $$

    In practice, since `labels` is a binary mask, (only 0s and 1s), dice can be computed as follow

    $$ \mathcal{L}_{\text{dice}(x, y) = 1 - \frac{2 * x * y }{x + y + 1}} $$

    Args:
        inputs (`torch.Tensor`):
            A tensor representing a mask.
        labels (`torch.Tensor`):
            A tensor with the same shape as inputs. Stores the binary classification labels for each element in inputs
            (0 for the negative class and 1 for the positive class).
        num_masks (`int`):
            The number of masks present in the current batch, used for normalization.

    Returns:
        `torch.Tensor`: The computed loss.
    r   r4   rB   )rC   rD   rG   )r?   r@   r   probsrH   rI   r   s          r/   	dice_lossr     s    , NN$$Q''EUV^((,,,I))B--&**R..0K	A+/22D88::	!DKr.   c                     t          j        d          } || |          }|                    d                                          |z  }|S )a|  
    Args:
        inputs (`torch.Tensor`):
            A float tensor of arbitrary shape.
        labels (`torch.Tensor`):
            A tensor with the same shape as inputs. Stores the binary classification labels for each element in inputs
            (0 for the negative class and 1 for the positive class).

    Returns:
        loss (`torch.Tensor`): The computed loss.
    rL   rM   r   )r   rP   meanrG   )r?   r@   r   rT   cross_entropy_lossr   s         r/   sigmoid_cross_entropy_lossr   7  sR     $v666I"6622""1%%))++i7DKr.   c                       e Zd Zdedeeef         f fdZdeee	                  dee	         fdZ
dee         deeef         fdZd	ed
ee         deej                 deeef         fdZdej        deej                 deej                 de	deeej        f         f
dZd Zd Zdej        dej        fdZdej        de	de	dedej        f
dZ	 ddej        d	ej        deej                 d
eej                 deeeej        f                  deeej        f         fdZd
ej        dej        dej        fdZ xZS )EomtLossconfigweight_dictc                    t                                                       t          | dg           |j        | _        || _        |j        | _        t          j        | j        dz             }| j        |d<   | 	                    d|           |j
        | _        |j        | _        |j        | _        t          |j        |j        |j        | j                  | _        dS )aH  
        The Eomt Loss. The loss is computed very similar to DETR. The process happens in two steps: 1) we
        compute hungarian assignment between ground truth masks and the outputs of the model 2) we supervise each pair
        of matched ground-truth / prediction (supervise class and mask)

        Args:
            config (`EomtConfig`):
                The configuration for Eomt model also containing loss calculation specific parameters.
            weight_dict (`dict[str, float]`):
                A dictionary of weights to be applied to the different losses.
        scipyr   rB   empty_weight)r]   r_   r^   r`   N)rb   rc   r   
num_labelsr   no_object_weighteos_coefr(   onesregister_buffertrain_num_pointsr`   oversample_ratioimportance_sample_ratior[   class_weightdice_weightmask_weightmatcher)re   r   r   r   rf   s       r/   rc   zEomtLoss.__init__L  s     	$	*** +& /z$/A"566=R^\::: !1 & 7'-'E$+*((	
 
 
r.   sizesr2   c                     |d         }|dd          D ]0}t          |          D ]\  }}t          ||         |          ||<   1|S )Nr   r   )	enumeratemax)re   r   maxessublistindexitems         r/   _max_by_axiszEomtLoss._max_by_axiso  s`    aQRRy 	7 	7G(11 7 7t"5<66e7r.   tensorsc                 "   |                      d |D                       }t          |          g|z   }|\  }}}}|d         j        }|d         j        }	t	          j        |||	          }
t	          j        |||ft          j        |	          }t          ||
|          D ]l\  }}}|d |j	        d         d |j	        d         d |j	        d         f         
                    |           d|d |j	        d         d |j	        d         f<   m|
|fS )Nc                 6    g | ]}t          |j                  S r-   )r,   rO   )rs   r}   s     r/   rv   z8EomtLoss._pad_images_to_max_in_batch.<locals>.<listcomp>y  s"    %O%O%OVd6<&8&8%O%O%Or.   r   rp   rk   r   r4   F)r   lenrp   rk   r(   zerosr   boolziprO   copy_)re   r   max_sizebatch_shaper   _heightwidthrp   rk   padded_tensorspadding_masksr}   padded_tensorpadding_masks                  r/   _pad_images_to_max_in_batchz$EomtLoss._pad_images_to_max_in_batchw  s1   $$%O%Ow%O%O%OPP7||nx/'2$
Avu
 "[fMMM
J#>ejY_```36wP]3^3^ 	G 	G/FM<+FLO+->v|A->@Q&,q/@QQRXXY_```AFL*6<?*,=fl1o,==>>},,r.   r   rh   r   c                    |}|j         \  }}}t          j        | j                  }|                     |          }	t          j        d t          ||          D                       }
t          j        ||f| j	        t
          j
        |j                  }|
||	<   |                    dd          } |||          }d|i}|S )a  Compute the losses related to the labels using cross entropy.

        Args:
            class_queries_logits (`torch.Tensor`):
                A tensor of shape `batch_size, num_queries, num_labels`
            class_labels (`list[torch.Tensor]`):
                List of class labels of shape `(labels)`.
            indices (`tuple[np.array])`:
                The indices computed by the Hungarian matcher.

        Returns:
            `dict[str, Tensor]`: A dict of `torch.Tensor` containing the following key:
            - **loss_cross_entropy** -- The loss computed using cross entropy on the predicted and ground truth labels.
        )weightc                 *    g | ]\  }\  }}||         S r-   r-   )rs   targetr   ru   s       r/   rv   z(EomtLoss.loss_labels.<locals>.<listcomp>  s$    HHH>66AqVAYHHHr.   )
fill_valuerp   rk   r   r4   loss_cross_entropy)rO   r   CrossEntropyLossr   $_get_predictions_permutation_indicesr(   catr   fullr   rr   rk   	transpose)re   r   rh   r   pred_logitsr   num_queriesr   rT   idxtarget_classes_otarget_classespred_logits_transposedloss_celossess                  r/   loss_labelszEomtLoss.loss_labels  s    " +%0%6"
K't/@AAA	77@@ 9HHSw-G-GHHH
 
 %$/]h]o
 
 
 /s!,!6!6q!!<!<)2NCC&0r.   r   rg   r   c                 f                          |          }                     |          }||         }                     |          \  }}	||         }|dddf         }|dddf         }t          j                    5                       | fd j         j         j                  }
t          ||
d          
                    d          }ddd           n# 1 swxY w Y   t          ||
d          
                    d          }t          |||          t          |||          d}~~|S )a  Compute the losses related to the masks using sigmoid_cross_entropy_loss and dice loss.

        Args:
            masks_queries_logits (`torch.Tensor`):
                A tensor of shape `(batch_size, num_queries, height, width)`.
            mask_labels (`torch.Tensor`):
                List of mask labels of shape `(labels, height, width)`.
            indices (`tuple[np.array])`:
                The indices computed by the Hungarian matcher.
            num_masks (`int)`:
                The number of masks, used for normalization.

        Returns:
            losses (`dict[str, Tensor]`): A dict of `torch.Tensor` containing two keys:
            - **loss_mask** -- The loss computed using sigmoid cross entropy loss on the predicted and ground truth.
              masks.
            - **loss_dice** -- The loss computed using dice loss on the predicted on the predicted and ground truth,
              masks.
        Nc                 .                         |           S N)calculate_uncertainty)logitsre   s    r/   <lambda>z%EomtLoss.loss_masks.<locals>.<lambda>  s    t99&AA r.   Frl   r   )	loss_mask	loss_dice)r    _get_targets_permutation_indicesr   r(   r   sample_points_using_uncertaintyr`   r   r   r>   r:   r   r   )re   r   rg   r   r   src_idxtgt_idx
pred_maskstarget_masksr   r1   point_labelspoint_logitsr   s   `             r/   
loss_maskszEomtLoss.loss_masks  s   4 ;;GDD77@@)'2
 ::;GGa#G,  4(
#AAAtG, ]__ 		i 		i $ D DAAAA%,! ! (6GW\]]]eefghhL		i 		i 		i 		i 		i 		i 		i 		i 		i 		i 		i 		i 		i 		i 		i $J0AQVWWW__`abb 4L,PYZZ"<yII
 

 s   ?ACC #C c                     t          j        d t          |          D                       }t          j        d |D                       }||fS )Nc                 D    g | ]\  }\  }}t          j        ||          S r-   r(   	full_like)rs   rt   srcr   s       r/   rv   zAEomtLoss._get_predictions_permutation_indices.<locals>.<listcomp>  s,    "a"a"a{q(35?3#:#:"a"a"ar.   c                     g | ]\  }}|S r-   r-   )rs   r   r   s      r/   rv   zAEomtLoss._get_predictions_permutation_indices.<locals>.<listcomp>  s    (E(E(E#q(E(E(Er.   r(   r   r   )re   r   batch_indicespredictions_indicess       r/   r   z-EomtLoss._get_predictions_permutation_indices  sT    	"a"aiX_N`N`"a"a"abb#i(E(EW(E(E(EFF111r.   c                     t          j        d t          |          D                       }t          j        d |D                       }||fS )Nc                 D    g | ]\  }\  }}t          j        ||          S r-   r   )rs   rt   r   tgts       r/   rv   z=EomtLoss._get_targets_permutation_indices.<locals>.<listcomp>  s,    "a"a"a{q(1c5?3#:#:"a"a"ar.   c                     g | ]\  }}|S r-   r-   )rs   r   r   s      r/   rv   z=EomtLoss._get_targets_permutation_indices.<locals>.<listcomp>  s    #@#@#@HQC#@#@#@r.   r   )re   r   r   target_indicess       r/   r   z)EomtLoss._get_targets_permutation_indices  sR    	"a"aiX_N`N`"a"a"abb#@#@#@#@#@AAn,,r.   r   c                 0    t          j        |           }|S )a  
        In Eomt paper, uncertainty is estimated as L1 distance between 0.0 and the logit prediction in 'logits'
        for the foreground class in `classes`.

        Args:
            logits (`torch.Tensor`):
            A tensor of shape (R, 1, ...) for class-specific or class-agnostic, where R is the total number of predicted masks in all images and C is:
            the number of foreground classes. The values are logits.

        Returns:
            scores (`torch.Tensor`): A tensor of shape (R, 1, ...) that contains uncertainty scores with the most
            uncertain locations having the highest uncertainty score.
        )r(   abs)re   r   uncertainty_scoress      r/   r   zEomtLoss.calculate_uncertainty  s      %y001!!r.   r`   r   r   c           	         |j         d         }t          ||z            }t          j        ||d|j                  }t          ||d          }	 ||	          }
t          ||z            }||z
  }t          j        |
dddddf         |d          d         }|t          j        |t          j        |j        	          z  }||dddf         z  }|	                    d
d          |	                    d
          ddf         	                    ||d          }|dk    r3t          j
        |t          j        ||d|j                  gd          }|S )a  
        This function is meant for sampling points in [0, 1] * [0, 1] coordinate space based on their uncertainty. The
        uncertainty is calculated for each point using the passed `uncertainty function` that takes points logit
        prediction as input.

        Args:
            logits (`float`):
                Logit predictions for P points.
            uncertainty_function:
                A function that takes logit predictions for P points and returns their uncertainties.
            num_points (`int`):
                The number of points P to sample.
            oversample_ratio (`int`):
                Oversampling parameter.
            importance_sample_ratio (`float`):
                Ratio of points that are sampled via importance sampling.

        Returns:
            point_coordinates (`torch.Tensor`):
                Coordinates for P sampled points.
        r   r4   rj   Frl   Nr   )kr6   r   rB   r6   )rO   r   r(   rz   rk   r>   topkarangelongviewr   )re   r   uncertainty_functionr`   r   r   	num_boxesnum_points_sampledr1   r   point_uncertaintiesnum_uncertain_pointsnum_random_pointsr   shifts                  r/   r   z(EomtLoss.sample_points_using_uncertainty  s   < LO	 .>!>?? "Jy2DaPVP]^^^#F,=USSS22<@@"#:Z#GHH&)==j,QQQ111W59MSTUUUVWX"U\)5:V\Vc%d%d%dduQQQW~-222q99#((2,,/JOOPY[oqrssq   %	"EJy:KQW]Wd$e$e$ef! ! ! ! r.   Nauxiliary_predictionsc                    |                      ||||          }|                     ||d         j                  }i |                     ||||          |                     |||          }|rt          |          D ]b\  }	|	d         }|	d         }|                     ||||          }
fd|
                                D             }
|                    |
           c|S )a  
        This performs the loss computation.

        Args:
            masks_queries_logits (`torch.Tensor`):
                A tensor of shape `(batch_size, num_queries, height, width)`.
            class_queries_logits (`torch.Tensor`):
                A tensor of shape `(batch_size, num_queries, num_labels)`.
            mask_labels (`torch.Tensor`):
                List of mask labels of shape `(labels, height, width)`.
            class_labels (`list[torch.Tensor]`):
                List of class labels of shape `(labels)`.
            auxiliary_predictions (`dict[str, torch.Tensor]`, *optional*):
                if `use_auxiliary_loss` was set to `true` in [`EomtConfig`], then it contains the logits from
                the inner layers of the EomtMaskedAttentionDecoder.

        Returns:
            losses (`dict[str, Tensor]`): A dict of `torch.Tensor` containing three keys:
            - **loss_cross_entropy** -- The loss computed using cross entropy on the predicted and ground truth labels.
            - **loss_mask** -- The loss computed using sigmoid cross_entropy loss on the predicted and ground truth
              masks.
            - **loss_dice** -- The loss computed using dice loss on the predicted on the predicted and ground truth
              masks.
            if `use_auxiliary_loss` was set to `true` in [`EomtConfig`], the dictionary contains additional
            losses for each auxiliary predictions.
        r   rj   Nr   r   c                 &    i | ]\  }}| d  |S )r   r-   )rs   keyvaluer   s      r/   
<dictcomp>z$EomtLoss.forward.<locals>.<dictcomp>n  s)    WWWzsE^^c^^UWWWr.   )	r   get_num_masksrk   r   r   r   r   itemsupdate)re   r   r   rg   rh   r  r   r   r   aux_outputs	loss_dictr   s              @r/   r   zEomtLoss.forward<  s   H ,,35I;Xdee&&|LO<R&SS	%
oo2K)TT%
3\7KK%

 !,$-.C$D$D ) ) ['23I'J$'23I'J$ LL)=?SU`bnoo	WWWWY__EVEVWWW	i((((r.   rk   c                 0   t          d |D                       }t          j        |t          j        |          }d}t	                      r2t
          j        i k    r"t          |          }t                      j        }t          j	        ||z  d          }|S )zk
        Computes the average number of target masks across the batch, for normalization purposes.
        c              3   4   K   | ]}t          |          V  d S r   )r   )rs   classess     r/   	<genexpr>z)EomtLoss.get_num_masks.<locals>.<genexpr>w  s(      AAGAAAAAAr.   r   r   )min)
rG   r(   rq   r   r   r   _shared_stater   num_processesclamp)re   rh   rk   r   
world_sizes        r/   r  zEomtLoss.get_num_maskss  s     AALAAAAA	OIU[PPP	
"$$ 	:)R//"9--	)^^9
K	J 6A>>>	r.   r   )r$   r%   r&   r   dictstrr   rc   r,   r   r   r   r+   r   nparrayr   r(   r   r   r   r   r   r   r   rk   r  r   r   s   @r/   r   r   K  s       !
z !
S%Z8H !
 !
 !
 !
 !
 !
F$tCy/ d3i    -4< -E&RX.DY - - - -" $* :>v, QVWYW_Q` 	c6k	       D<#l< %,'< rx	<
 < 
c5<	 < < < <|2 2 2- - -"EL "U\ " " " ""5!5! 	5!
 5! "'5! 
5! 5! 5! 5!z DH5 5#l5 $l5 %,'	5
 5<(5  (S%,->(?@5 
c5<	 5 5 5 5n%,  QVQ]        r.   r   c                   F     e Zd ZdZ fdZdej        dej        fdZ xZS )EomtPatchEmbeddingsz
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    c                    t                                                       |j        |j        }}|j        |j        }}t          |t          j        j	                  r|n||f}t          |t          j        j	                  r|n||f}|d         |d         z  |d         |d         z  z  }|| _        || _        || _        || _
        t          j        ||||          | _        d S )Nr   r   kernel_sizestride)rb   rc   
image_size
patch_sizenum_channelshidden_size
isinstancecollectionsabcIterablenum_patchesr   Conv2d
projection)re   r   r+  r,  r-  r.  r3  rf   s          r/   rc   zEomtPatchEmbeddings.__init__  s    !'!2F4EJ
$*$79Kk#-j+/:R#S#SqZZZdfpYq
#-j+/:R#S#SqZZZdfpYq
!!}
15*Q-:VW=:XY$$(&)L+:^hiiir.   pixel_valuesr2   c                     |j         d         }|| j        k    rt          d| j         d| d          |                     |                              d                              dd          }|S )Nr   zoMake sure that the channel dimension of the pixel values match with the one set in the configuration. Expected z	 but got .r4   )rO   r-  rd   r5  rD   r   )re   r6  r-  
embeddingss       r/   r   zEomtPatchEmbeddings.forward  s    #)!,4,,,I!.I I9EI I I   __\22::1==GG1MM
r.   )	r$   r%   r&   r'   rc   r(   r   r   r   r   s   @r/   r&  r&    sm         j j j j jEL U\        r.   r&  c                   P     e Zd ZdZdeddf fdZdej        dej        fdZ xZ	S )EomtEmbeddingszM
    Construct the CLS token, mask token, position and patch embeddings.
    r   r2   Nc                    t                                                       || _        |j        | _        t	          j        t          j        dd|j                            | _	        t	          j        t          j
        d|j        |j                            | _        t          |          | _        | j        j        }t	          j        |j                  | _        d|j        z   | _        t	          j        ||j                  | _        |                     dt          j        |                              d          d           d S )Nr   position_ids)r   rB   F)
persistent)rb   rc   r   r,  r   	Parameterr(   randnr.  	cls_tokenr   num_register_tokensregister_tokensr&  patch_embeddingsr3  Dropouthidden_dropout_probdropoutnum_prefix_tokens	Embeddingposition_embeddingsr   r  expand)re   r   r3  rf   s      r/   rc   zEomtEmbeddings.__init__  s    +ek!Q8J&K&KLL!|EK6;UW]Wi,j,jkk 3F ; ;+7z&"<==!"V%?!?#%<V=O#P#P ^U\+-F-F-M-Mg-V-Vchiiiiir.   r6  c                    |j         \  }}}}| j        j        j        j        }|                     |                    |                    }| j                            |dd          }| j                            |dd          }|| 	                    | j
                  z   }t          j        |||gd          }|                     |          }|S )Nro   rB   r   r  )rO   rD  r5  r   rp   ry   rA  rK  rC  rJ  r=  r(   r   rG  )re   r6  r   r   target_dtyper9  
cls_tokensrC  s           r/   r   zEomtEmbeddings.forward  s    *0
Aq!,7>D**<???+N+NOO
^**:r2>>
.55j"bII$":":4;L"M"MM
Y
OZHaPPP
\\*--
r.   )
r$   r%   r&   r'   r   rc   r(   r   r   r   r   s   @r/   r;  r;    s         jz jd j j j j j j EL U\        r.   r;          modulequeryr  r  attention_maskscalingrG  c                    t          j        ||                    dd                    |z  }|||z   }t          j                            |dt           j                                      |j                  }t          j        	                    ||| j
                  }t          j        ||          }	|	                    dd                                          }	|	|fS )NrB   )r6   rp   )ptrainingr   r4   )r(   rE   r   r   r8   rx   float32ry   rp   rG  rW  
contiguous)
rP  rQ  r  r  rR  rS  rG  r<   attn_weightsattn_outputs
             r/   eager_attention_forwardr\    s     <s}}R'<'<==GL!#n4=((2U](SSVVW\WbccL=((6?([[L,|U33K''1--88::K$$r.   c            
            e Zd ZdZ fdZ	 ddej        deej                 deej        eej                 f         fdZ	 xZ
S )	EomtAttentionz=Multi-headed attention from 'Attention Is All You Need' paperc                    t                                                       || _        |j        | _        |j        | _        | j        | j        z  | _        | j        | j        z  | j        k    r t          d| j         d| j         d          | j        dz  | _	        |j
        | _        d| _        t          j        | j        | j                  | _        t          j        | j        | j                  | _        t          j        | j        | j                  | _        t          j        | j        | j                  | _        d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      F)rb   rc   r   r.  	embed_dimnum_attention_heads	num_headshead_dimrd   scaleattention_dropoutrG  	is_causalr   Lineark_projv_projq_projout_projre   r   rf   s     r/   rc   zEomtAttention.__init__  s   +3$.8=4>)T^;;'dn ' 'N' ' '   ]D(
/i??i??i??	$.$.AAr.   Nr!   rR  r2   c           
         |j         \  }}}|                     |          }|                     |          }|                     |          }	|                    ||| j        | j                                      dd          }|                    ||| j        | j                                      dd          }|	                    ||| j        | j                                      dd          }	t          }
| j	        j
        dk    rt          | j	        j
                 }
 |
| |||	|| j        | j        | j        sdn| j                  \  }}|                    |||                                          }|                     |          }||fS )z#Input shape: Batch x Time x Channelr   r4   eagerrO  )rf  rS  rG  )rO   rj  rh  ri  r  rb  rc  r   r\  r   _attn_implementationr   rf  rd  rW  rG  reshaperY  rk  )re   r!   rR  r<   r   
seq_lengthr`  querieskeysvaluesattention_interfacer[  rZ  s                r/   r   zEomtAttention.forward  sy    -:,?)
J	++m,,{{=))]++,,z:t~t}UU__`acdeeyyZOOYYZ[]^__ZT^T]SS]]^_abcc(?;+w66"9$+:Z"[$7$7nJ#}>CC$,	%
 	%
 	%
!\ "))*j)LLWWYYmmK00L((r.   r   )r$   r%   r&   r'   rc   r(   r   r   r+   r   r   r   s   @r/   r^  r^    s        GGB B B B B. 26$) $)|$) !.$)
 
u|Xel33	4$) $) $) $) $) $) $) $)r.   r^  c                   D     e Zd Zd fdZdej        dej        fdZ xZS )EomtLayerScaler2   Nc                     t                                                       t          j        |j        t          j        |j                  z            | _        d S r   )	rb   rc   r   r?  layerscale_valuer(   r   r.  lambda1rl  s     r/   rc   zEomtLayerScale.__init__  sC    |F$;ejI[>\>\$\]]r.   hidden_statec                     || j         z  S r   )rz  re   r{  s     r/   r   zEomtLayerScale.forward#  s    dl**r.   r2   Nr$   r%   r&   rc   r(   r   r   r   r   s   @r/   rw  rw    si        ^ ^ ^ ^ ^ ^+EL +U\ + + + + + + + +r.   rw  input	drop_probrW  c                     |dk    s|s| S d|z
  }| j         d         fd| j        dz
  z  z   }|t          j        || j        | j                  z   }|                                 |                     |          |z  }|S )aF  
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    rO  r   r   )r   r   )rO   ndimr(   rz   rp   rk   floor_div)r  r  rW  	keep_probrO   random_tensoroutputs          r/   	drop_pathr  '  s     CxII[^
Q 77E
5EL Y Y YYMYYy!!M1FMr.   c                   j     e Zd ZdZd	dee         ddf fdZdej        dej        fdZ	de
fdZ xZS )
EomtDropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr  r2   c                 V    t                                                       || _        d S r   )rb   rc   r  )re   r  rf   s     r/   rc   zEomtDropPath.__init__>  s$    "r.   r!   c                 8    t          || j        | j                  S r   )r  r  rW  re   r!   s     r/   r   zEomtDropPath.forwardB  s    FFFr.   c                     d| j          S )Nzp=)r  re   s    r/   
extra_reprzEomtDropPath.extra_reprE  s    $DN$$$r.   r   )r$   r%   r&   r'   r   r   rc   r(   r   r   r"  r  r   r   s   @r/   r  r  ;  s        bb# #(5/ #T # # # # # #GU\ Gel G G G G%C % % % % % % % %r.   r  c                   D     e Zd Zd fdZdej        dej        fdZ xZS )EomtMLPr2   Nc                 ~   t                                                       |j        x}}t          |j        |j        z            }t          j        ||d          | _        t          |j	        t                    rt          |j	                 | _        n|j	        | _        t          j        ||d          | _        d S )NTbias)rb   rc   r.  r   	mlp_ratior   rg  fc1r/  
hidden_actr"  r	   
activationfc2re   r   in_featuresout_featureshidden_featuresrf   s        r/   rc   zEomtMLP.__init__J  s    %+%77lf063CCDD9[/EEEf'-- 	0$V%67DOO$/DO9_lFFFr.   r{  c                     |                      |          }|                     |          }|                     |          }|S r   )r  r  r  r}  s     r/   r   zEomtMLP.forwardU  s;    xx--|44xx--r.   r~  r  r   s   @r/   r  r  I  si        	G 	G 	G 	G 	G 	GEL U\        r.   r  c                   D     e Zd Zd fdZdej        dej        fdZ xZS )EomtSwiGLUFFNr2   Nc                 D   t                                                       |j        x}}t          |j        |j        z            }t          |dz  dz            dz   dz  dz  }t          j        |d|z  d          | _        t          j        ||d          | _        d S )Nr4   r         Tr  )	rb   rc   r.  r   r  r   rg  
weights_inweights_outr  s        r/   rc   zEomtSwiGLUFFN.__init__]  s    %+%77lf063CCDD2Q677!;AAE)K_1D4PPP9_lNNNr.   r{  c                     |                      |          }|                    dd          \  }}t          j                            |          |z  }|                     |          S )Nr4   rB   r  )r  chunkr   r8   silur  )re   r{  x1x2hiddens        r/   r   zEomtSwiGLUFFN.forwardf  s]    |44##A2#..B##B''",'''r.   r~  r  r   s   @r/   r  r  \  si        O O O O O O(EL (U\ ( ( ( ( ( ( ( (r.   r  c                   n     e Zd ZdZdeddf fdZ	 d	dej        deej                 dej        fdZ	 xZ
S )
	EomtLayerzCThis corresponds to the Block class in the original implementation.r   r2   Nc                 "   t                                                       t          j        |j        |j                  | _        t          |          | _        t          |          | _
        |j        dk    rt          |j                  nt          j                    | _        t          j        |j        |j                  | _        |j        rt#          |          | _        nt'          |          | _        t          |          | _        d S )NepsrO  )rb   rc   r   	LayerNormr.  layer_norm_epsnorm1r^  	attentionrw  layer_scale1drop_path_rater  Identityr  norm2use_swiglu_ffnr  mlpr  layer_scale2rl  s     r/   rc   zEomtLayer.__init__p  s    \&"4&:OPPP
&v..*622@F@UX[@[@[f&;<<<acalanan\&"4&:OPPP
  	'$V,,DHHvDH*622r.   r!   	head_maskc                 j   |                      |          }|                     ||          \  }}|                     |          }|                     |          |z   }|                     |          }|                     |          }|                     |          }|                     |          |z   }|S r   )r  r  r  r  r  r  r  )re   r!   r  hidden_states_normself_attention_outputr   layer_outputs          r/   r   zEomtLayer.forward  s    
 "ZZ66#'>>2Di#P#P q $ 1 12G H H '<==M zz-00xx--((66 ~~l33mCr.   r   )r$   r%   r&   r'   r   rc   r(   r   r   r   r   r   s   @r/   r  r  m  s        MM3z 3d 3 3 3 3 3 3& -1 | EL) 
	       r.   r  c                   D     e Zd Zd fd	Zdej        dej        fdZ xZS )EomtLayerNorm2dư>Tc                 P    t                                          |||           d S )N)r  elementwise_affine)rb   rc   )re   r-  r  affinerf   s       r/   rc   zEomtLayerNorm2d.__init__  s(    36JJJJJr.   r{  r2   c                     |                     dddd          }t          j        || j        | j        | j        | j                  }|                     dddd          }|S )Nr   r4   r   r   )permuteF
layer_normnormalized_shaper   r  r  r}  s     r/   r   zEomtLayerNorm2d.forward  s^    #++Aq!Q77|L$2GVZV_aeaijj#++Aq!Q77r.   )r  Tr  r   s   @r/   r  r    si        K K K K K KEL U\        r.   r  c                   H     e Zd Zdef fdZdej        dej        fdZ xZS )EomtScaleLayerr   c                 $   t                                                       |j        }t          j        ||dd          | _        t          |j                 | _        t          j	        ||dd|d          | _
        t          |          | _        d S )Nr4   r(  r   r   F)r)  paddinggroupsr  )rb   rc   r.  r   ConvTranspose2dconv1r	   r  r  r4  conv2r  layernorm2dre   r   r.  rf   s      r/   rc   zEomtScaleLayer.__init__  s    ('[aXYZZZ
 !23Y
 
 

 +;77r.   r!   r2   c                     |                      |          }|                     |          }|                     |          }|                     |          }|S r   )r  r  r  r  r  s     r/   r   zEomtScaleLayer.forward  sN    

=1166

=11((77r.   	r$   r%   r&   r   rc   r(   r   r   r   r   s   @r/   r  r    sj        8z 8 8 8 8 8 8 U\ el        r.   r  c                   H     e Zd Zdef fdZdej        dej        fdZ xZS )EomtScaleBlockr   c                     t                                                       j        | _        t	          j        fdt          | j                  D                       | _        d S )Nc                 .    g | ]}t                    S r-   )r  rs   r   r   s     r/   rv   z+EomtScaleBlock.__init__.<locals>.<listcomp>  s!    #[#[#[qN6$:$:#[#[#[r.   )rb   rc   num_upscale_blocks
num_blocksr   
ModuleListrw   blockrl  s    `r/   rc   zEomtScaleBlock.__init__  sX     3]#[#[#[#[E$/DZDZ#[#[#[\\


r.   r!   r2   c                 0    | j         D ]} ||          }|S r   )r  )re   r!   r  s      r/   r   zEomtScaleBlock.forward  s*    Z 	1 	1E!E-00MMr.   r  r   s   @r/   r  r    sq        ]z ] ] ] ] ] ]
U\ el        r.   r  c                   H     e Zd Zdef fdZdej        dej        fdZ xZS )EomtMaskHeadr   c                     t                                                       |j        }t          j        ||          | _        t          j        ||          | _        t          j        ||          | _        t          |j	                 | _
        d S r   )rb   rc   r.  r   rg  r  r  fc3r	   r  r  r  s      r/   rc   zEomtMaskHead.__init__  sm    (9[+669[+669[+66 !23r.   r!   r2   c                     |                      |                     |                    }|                      |                     |                    }|                     |          }|S r   )r  r  r  r  r  s     r/   r   zEomtMaskHead.forward  sS    (?(?@@(?(?@@//r.   r  r   s   @r/   r  r    sj        4z 4 4 4 4 4 4U\ el        r.   r  c                   X    e Zd ZU dZeed<   dZdZdZdgZ	dZ
eedZd	ej        d
dfdZdS )EomtPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    r   eomtr6  Fr  T)r!   r"   rP  r2   Nc                 ^   | j         j        }t          |t          j        t          j        t          j        f          rt          j                            |j	        t          j        d                     |j        ot          j                            |j	                  \  }}|dk    rdt          j        |          z  nd}t          j                            |j        | |           d S d S t          |t          j                  r?|j	        j                            d           |j        j                                         d S t          |t          j                  rU|j	        j                            dd           |j        +|j	        j        |j                                                  d S d S t          |t,                    r=t/          |d          r+|j        j                            | j         j                   d S d S t          |t4                    rt          j                            |j        j                            t<          j                  d|                              |j        j                   |j        _        |j!        j                                         d S d S )	N   )ar   r   r5   rO  )r   stdrz  )"r   initializer_ranger/  r   rg  r4  r  initkaiming_uniform_r   mathsqrtr  _calculate_fan_in_and_fan_outuniform_r  datafill_zero_rI  normal_padding_idxrw  hasattrrz  ry  r;  trunc_normal_rA  ry   r(   rX  rp   rC  )re   rP  r  fan_inr   bounds         r/   _init_weightsz!EomtPreTrainedModel._init_weights  s[   k+fry")R5GHII 	0G$$V]dill$CCC{&GAA&-PP	17!DIf----  ufe<<<<< '& -- 	0M$$S)))K""$$$$$-- 	0M&&CQ&777!-"6#56<<>>>>> .-// 	0vy)) H#))$+*FGGGGGH H// 	0$&G$9$9 %((77cs %: % %b!'(( ! "'--/////		0 	0r.   )r$   r%   r&   r'   r   r*   base_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modules_supports_sdpar  r^  _can_record_outputsr   Moduler   r-   r.   r/   r  r    s          
 $O&+#$N"# 
0BI 0$ 0 0 0 0 0 0r.   r  zV
    The EoMT Model with head on top for instance/semantic/panoptic segmentation.
    c                   V    e Zd ZdZdef fdZdededededeeef         d	eeef         fd
Z	deeef         d	efdZ
ee	 	 	 ddedeee                  deee                  deee                  dee         d	efd                        Zd Zdej        fdZed             Z xZS )EomtForUniversalSegmentationr6  r   c                 j   t                                                     | _        j        | _        t	                    | _        t          j        j        j	                  | _
        t          j        j        j                  | _        t          j        fdt          j                  D                       | _        t#                    | _        t'                    | _        t          j        j        j        dz             | _        j        j        z  j        j        z  f| _        j        j        j        d| _        t?          | j                  | _         | !                    dtE          j#        j$                             | %                                 d S )Nr  c                 .    g | ]}t                    S r-   )r  r  s     r/   rv   z9EomtForUniversalSegmentation.__init__.<locals>.<listcomp>  s!    $`$`$`1Yv%6%6$`$`$`r.   r   )r   r   r   )r   r   attn_mask_probs)&rb   rc   r   num_hidden_layersr;  r9  r   r  r.  r  	layernormrI  r   rQ  r  rw   layersr  upscale_blockr  	mask_headrg  r   class_predictorr+  r,  	grid_sizer   r   r   r   r   rT   r   r(   r   r  	post_initrl  s    `r/   rc   z%EomtForUniversalSegmentation.__init__  sy      !'!9(00f&8f>STTT\&"4f6HII
m$`$`$`$`fF^@_@_$`$`$`aa+F33%f--!y);V=NQR=RSS +v/@@&BSW]WhBhi"("5++.
 .
 "T=MNNN.
6;L0M0MNNNr.   r   r   rg   rh   r  r2   c                     |                      |||||          }| j                                        D ](\  }}|                                D ]\  }	}
||	v r|
|z  }
)|S )Nr   r   rg   rh   r  )rT   r   r  )re   r   r   rg   rh   r  r  r  r   loss_keyr   s              r/   get_loss_dictz*EomtForUniversalSegmentation.get_loss_dict'  s     (,~~!5!5#%"7 (6 (
 (
	  +1133 	# 	#KC"+//"3"3 # #$(??FND# r.   r  c                 D    t          |                                          S r   )rG   rt  )re   r  s     r/   get_lossz%EomtForUniversalSegmentation.get_loss?  s    9##%%&&&r.   Nr#   r<   c                    d\  }}d}|t          d          |                     |          }	t          | j                  D ]w\  }
}|
| j        | j        j        z
  k    ri| j        j        dddddf         	                    |	j
        d         dd                              |	j                  }t          j        ||	fd          }	|
| j        | j        j        z
  k    r| j        s'| j        |
| j        z
  | j        j        z            dk    r|                     |	          }|                     |          \  }}||fz  }||fz  }t          j        |	j
        d         |	j
        d         |	j
        d         |	j        t          j                  }t+          j        || j        d	
          }|                    |                    d          |                    d          d          }| j        j        }|| j        j        z   }|dk    |ddd||df<   |                     || j        |
| j        z
  | j        j        z            |||j                  }|ddddf         	                    d| j        j        dd          }|                                                    | d          } ||	|          }	y|                     |	          }|                     |          \  }}||fz  }||fz  }d}|L|Jd}tA          ||          D ]7\  }}| !                    ||||d          }|| "                    |          z  }8tG          |||||          S )ag  
        mask_labels (`list[torch.Tensor]`, *optional*):
            list of mask labels of shape `(num_labels, height, width)` to be fed to a model
        class_labels (`list[torch.LongTensor]`, *optional*):
            list of target class labels of shape `(num_labels, height, width)` to be fed to a model. They identify the
            labels of `mask_labels`, e.g. the label of `mask_labels[i][j]` if `class_labels[i][j]`.
        patch_offsets (`list[torch.Tensor]`, *optional*):
            list of tuples indicating the image index and start and end positions of patches for semantic segmentation.
        )r-   r-   Nz You have to specify pixel_valuesr   rB   r   r  )rk   rp   bilinear)sizemode)probnum_query_tokensencoder_start_tokensrk   .g    erO  r  )r   r   r   r    r#   )$rd   r9  r   r  r  r   r  rQ  r   rK  rO   ry   rk   r(   r   rW  r  r  predictr   r   r  interpolater  r  r  r   rH  _disable_attention_maskra  r   masked_fillr   r  r  r   )re   r6  rg   rh   r#   r<   masks_queries_logits_per_layerclass_queries_logits_per_layerrR  r!   r   layer_modulerQ  norm_hidden_statesr   r   interpolated_logitsr   r!  sequence_outputr   r  s                         r/   r   z$EomtForUniversalSegmentation.forwardB  s   ( JPF&(F?@@@55!*4;!7!7 .	H .	HCd,t{/EEEE
)$111*5<<]=PQR=SUWY[\\__`m`tuu %	5-*@a H H Hd,t{/EEEE F!%!5cD<R6RUYU`Uk6k!lop!p!p%)^^M%B%B"=A\\J\=]=]:$&:.3G2II..3G2II.!&!'*!'*!'*(/*" " " '(m4Ht~dn&o&o&o#&9&>&>',,Q//1D1I1I!1L1Lb' '# $(;#: '7$/:['[$ ObdeNeqqq"3#3"35I5J5JJK "&!=!="-cD4J.JT[Mc.cd%5)=)0 "> " " "04!=!D!DRIhjlnp!q!q!/!5!5!7!7!C!C^OUY!Z!Z(LGGMM..7759\\/5R5R22&+?*AA&&+?*AA&"|'?D>A.0N? ? 
1 
1:$&: !..)=)= +!-*. /  	 i0001!5!5-'
 
 
 	
r.   c                     | j         j        S r   )r9  rD  r  s    r/   get_input_embeddingsz1EomtForUniversalSegmentation.get_input_embeddings  s    //r.   r   c                    |d d d | j         j        d d f         }|                     |          }|d d | j         j        | j        j        z   d d d f         }|                    dd          } |j        |j        d         dg| j        R  }| 	                    |          }| 
                    |          }t          j        d||          }||fS )Nr   r4   r   rB   zbqc, bchw -> bqhw)r   r   r  r9  rH  r   rp  rO   r  r  r  r(   einsum)re   r   query_tokensclass_logitsprefix_tokensmask_logitss         r/   r"  z$EomtForUniversalSegmentation.predict  s    aaa!:4;#:!:AAA=>++L99qqq$+"9DO<]"]"_"_abababbc%//155--m.A!.DbZ4>ZZZ~~l33**=99l#6mTTL((r.   c                     |dk     r:t          j        | j        d         ||          |k    }d| d d d ||d f         |<   | S )Nr   r   rj   )r(   rz   rO   )	attn_maskr  r   r!  rk   random_queriess         r/   r$  z4EomtForUniversalSegmentation._disable_attention_mask  sb    !88"Z	(:<LU[\\\_ccN VWIaaa***,@,A,AAB>Rr.   )NNN)r$   r%   r&   r  r   rc   r   r!  r"  r  r  r   r   r   r,   r   r   r   r   r-  r(   r"  staticmethodr$  r   r   s   @r/   r	  r	    s        %Oz      8$ % 	
   $CK0 
c6k	   0'$sF{"3 ' ' ' ' '  /3/304e
 e
e
 d6l+e
 tF|,	e

  V-e
 +,e
 
,e
 e
 e
 ^ e
N0 0 0)el ) ) ) )    \    r.   r	  )F)rO  )rO  F)Icollections.abcr0  r  dataclassesr   typingr   r   numpyr#  r(   torch.nn.functionalr   r8   r  r   activationsr	   
file_utilsr
   r   r   modeling_layersr   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.genericr   configuration_eomtr   scipy.optimizer   
accelerater   accelerate.utilsr   r   r>   rJ   rY   r  r[   r   r   r   r   r&  r;  r   r\  r^  rw  r   r  r  r  r  r  r  r  r  r  r  r  r	  __all__r-   r.   r/   <module>rI     s  ,      ! ! ! ! ! ! % % % % % % % %                       ! ! ! ! ! ! L L L L L L L L L L 9 9 9 9 9 9 F F F F F F F F & & & & & & P P P P P P P P P P / / / / / / * * * * * *  5444444 ('''''''''''' 	 	 	7 7 7 7 7 7 7	 	 7B LQ L5:\
\   @  6    , u| X]Xd    8g g g g g29 g g gTf f      <u| U\ VY ^c^j    (u u u u ury u u up	    ")   B" " " " "RY " " "X % %I%<% 
% <	%
 U\*% % % % % %.;) ;) ;) ;) ;)BI ;) ;) ;)|+ + + + +RY + + + U\ e T V[Vb    (% % % % %29 % % %    bi   &( ( ( ( (BI ( ( ("' ' ' ' '* ' ' 'T    bl       RY   2	 	 	 	 	RY 	 	 	    29   " '0 '0 '0 '0 '0/ '0 '0 '0T   
    #6   
D !"@
Ar.   