
     `i+                         d dl Z d dlmZ ddlmZ ddlmZ ddlmZm	Z	m
Z
mZ  e            rd dlmZ 	 	 dd	e j        d
e j        dededef
dZ G d de          Z G d de	          Z	 	 	 	 ddZdS )    N   )center_to_corners_format)is_scipy_available   )HungarianMatcher	ImageLoss_set_aux_lossgeneralized_box_ioulinear_sum_assignment      ?inputstargets	num_boxesalphagammac                    |                                  }t          j                            | |d          }||z  d|z
  d|z
  z  z   }|d|z
  |z  z  }|dk    r||z  d|z
  d|z
  z  z   }	|	|z  }|                                |z  S )aR  
    Loss used in RetinaNet for dense detection: https://huggingface.co/papers/1708.02002.

    Args:
        inputs (`torch.FloatTensor` of arbitrary shape):
            The predictions for each example.
        targets (`torch.FloatTensor` with the same shape as `inputs`)
            A tensor storing the binary classification label for each element in the `inputs` (0 for the negative class
            and 1 for the positive class).
        num_boxes (`int`):
            The total number of boxes in the batch.
        alpha (`float`, *optional*, defaults to 0.25):
            Optional weighting factor in the range (0,1) to balance positive vs. negative examples.
        gamma (`int`, *optional*, defaults to 2):
            Exponent of the modulating factor (1 - p_t) to balance easy vs hard examples.

    Returns:
        Loss tensor
    none)	reductionr   r   )sigmoidnn
functional binary_cross_entropy_with_logitssum)
r   r   r   r   r   probce_lossp_tlossalpha_ts
             y/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/loss/loss_grounding_dino.pysigmoid_focal_lossr!      s    4 >>Dm<<VWX^<__G
.AHW5
5Cq3w5()Dzz'/QY1w;$??~88::	!!    c                   >    e Zd Z ej                    d             ZdS )GroundingDinoHungarianMatcherc                    |d         j         dd         \  }}|d                             dd                                          }|d                             dd          }|d         }t          j        d t          ||          D                       }||                    d	d
          z  }t          j        d |D                       }d}	d}
d|	z
  ||
z  z  d|z
  dz                                    z  }|	d|z
  |
z  z  |dz                                    z  }||z
  |                                z  }t          j	        ||d          }t          t          |          t          |                     }| j        |z  | j        |z  z   | j        |z  z   }|                    ||d	                                          }d |D             }d t#          |                    |d	                    D             }d |D             S )a  
        Args:
            outputs (`dict`):
                A dictionary that contains at least these entries:
                * "logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
                * "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates.
                * "label_maps": Tuple of tensors of dim [num_classes, hidden_dim].
            targets (`list[dict]`):
                A list of targets (len(targets) = batch_size), where each target is a dict containing:
                * "class_labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of
                  ground-truth
                 objects in the target) containing the class labels
                * "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates.

        Returns:
            `list[Tuple]`: A list of size `batch_size`, containing tuples of (index_i, index_j) where:
            - index_i is the indices of the selected predictions (in order)
            - index_j is the indices of the corresponding selected targets (in order)
            For each batch element, it holds: len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
        logitsNr   r   r   
pred_boxes
label_mapsc                 0    g | ]\  }}||d                   S )class_labels ).0	label_maptargets      r    
<listcomp>z9GroundingDinoHungarianMatcher.forward.<locals>.<listcomp>b   s(    tttFWiQW	&*@ Atttr"   T)dimkeepdimc                     g | ]
}|d          S boxesr+   r,   vs     r    r/   z9GroundingDinoHungarianMatcher.forward.<locals>.<listcomp>g   s     = = =7 = = =r"   r          @g:0yE>)pc                 8    g | ]}t          |d                    S r4   lenr6   s     r    r/   z9GroundingDinoHungarianMatcher.forward.<locals>.<listcomp>{   s"    222QQwZ222r"   c                 >    g | ]\  }}t          ||                   S r+   r   )r,   ics      r    r/   z9GroundingDinoHungarianMatcher.forward.<locals>.<listcomp>|   s)    ccc41a(1..cccr"   c                     g | ]E\  }}t          j        |t           j                   t          j        |t           j                   fFS ))dtype)torch	as_tensorint64)r,   r>   js      r    r/   z9GroundingDinoHungarianMatcher.forward.<locals>.<listcomp>}   sH    ssscgcdfg%+666QVQ\8]8]8]^sssr"   )shapeflattenr   rB   catzipr   logtcdistr
   r   	bbox_cost
class_cost	giou_costviewcpu	enumeratesplit)selfoutputsr   
batch_sizenum_queriesout_probout_bboxr(   target_bboxr   r   neg_cost_classpos_cost_classrN   rM   rO   cost_matrixsizesindicess                      r    forwardz%GroundingDinoHungarianMatcher.forwardD   s/   , #*("3"9"1""=
K 8$,,Q22::<<<(00A66\*
 Ytt[^_ikr[s[stttuu
*..R."F"FF
 i = =W = = =>> e)%8a(lT>Q=V=V=X=X<XY1x<E"9:4?T?T?V?V>VW$~5G
 K+;;;	 ))A()K)KMefqMrMrsss	 ny04?Z3OORVR`clRll!&&z;CCGGII22'222cc9[EVEVW\^`EaEa;b;bcccsskrssssr"   N)__name__
__module____qualname__rB   no_gradr`   r+   r"   r    r$   r$   C   s:        U]__8t 8t _8t 8t 8tr"   r$   c                   $    e Zd ZdZd Zd Zd ZdS )GroundingDinoImageLossa  
    This class computes the losses for `GroundingDinoForObjectDetection`. The process happens in two steps: 1) we
    compute hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair of
    matched ground-truth / prediction (supervise class and box).

    Args:
        matcher (`GroundingDinoHungarianMatcher`):
            Module able to compute a matching between targets and proposals.
        focal_alpha (`float`):
            Alpha parameter in focal loss.
        losses (`list[str]`):
            List of all the losses to be applied. See `get_loss` for a list of all available losses.
    c                 n    t           j                            |            || _        || _        || _        d S N)r   Module__init__matcherfocal_alphalosses)rT   rk   rl   rm   s       r    rj   zGroundingDinoImageLoss.__init__   s2    
	4   &r"   c           	         d         }t          j        fdt          t          ||                    D                       }t          j        d         d          }|                     |          }t          j        ||j        t           j                  }||                             t           j                  ||<   |S )z>
        Create one_hot based on the matching indices
        r&   c                     g | ]I\  }\  }\  }}|d k    r*|d         |         t          d         |                   z   n|d         |         JS )r   r*   r(   r;   )r,   r>   r.   _JrU   s        r    r/   zFGroundingDinoImageLoss._get_target_classes_one_hot.<locals>.<listcomp>   st       'A'A NOQRUU~&q)C0Ea0H,I,IIIX^_mXnopXq  r"   r(   r   )r1   )devicerA   )	rB   rH   rR   rI   _get_source_permutation_idx
zeros_likerr   longto)	rT   rU   r   r_   r&   r*   r(   idxtarget_classes_onehots	    `       r    _get_target_classes_one_hotz2GroundingDinoImageLoss._get_target_classes_one_hot   s     "y   +4S'5J5J+K+K  
 
 Yw|4!<<<
..w77 % 0UZU_ ` ` `%/%=%@%@%L%Lc"$$r"   c                 V   d|vrt          d          d|vrt          d          |                     |||          }|d         }|d         }t          j        ||          }t          j        ||          }|                                }t          |||| j        d          }d|i}	|	S )z
        Classification loss (Binary focal loss) targets dicts must contain the key "class_labels" containing a tensor
        of dim [nb_target_boxes]
        r&   z#No logits were found in the outputs	text_maskz&No text_mask were found in the outputsr   )r   r   r   r   r   loss_ce)KeyErrorry   rB   masked_selectfloatr!   rl   )
rT   rU   r   r_   r   rx   source_logitsr{   r|   rm   s
             r    loss_labelsz"GroundingDinoImageLoss.loss_labels   s    
 7""@AAAg%%CDDD $ @ @'SZ [ [)K(	 +M9EE % 34I9 U U 5 ; ; = =$ )"
 
 
 W%r"   N)ra   rb   rc   __doc__rj   ry   r   r+   r"   r    rf   rf      sK           % % %(    r"   rf   c                    t          |j        |j        |j                  }g d}t	          ||j        |          }|                    |           i }| |d<   ||d<   ||d<   ||d<   d }|j        r$t          ||          }|D ]}||d<   ||d<   ||d<    |||          |j	        rF|	|
||d	} |||          }d
 |
                                D             }                    |           d|j        |j        d|j	        r3d 
                                D             }                    |           |j        rdi }t          |j        dz
            D ]5|                    fd
                                D                        6                    |           t!          fdD                       }||fS )N)rN   rM   rO   )labelsr5   cardinality)rk   rl   rm   r&   r'   r(   r{   auxiliary_outputs)r&   r'   r(   r{   c                      i | ]\  }}|d z   |S _encr+   r,   kr7   s      r    
<dictcomp>z7GroundingDinoForObjectDetectionLoss.<locals>.<dictcomp>   s"    QQQtq!QZQQQr"   r8   )r|   	loss_bbox	loss_giouc                      i | ]\  }}|d z   |S r   r+   r   s      r    r   z7GroundingDinoForObjectDetectionLoss.<locals>.<dictcomp>  s"    IIITQ1v:qIIIr"   r   c                 (    i | ]\  }}|d  z   |S )rp   r+   )r,   r   r7   r>   s      r    r   z7GroundingDinoForObjectDetectionLoss.<locals>.<dictcomp>  s)    #S#S#Stq!AAK#S#S#Sr"   c              3   B   K   | ]}|v |         |         z  V  d S rh   r+   )r,   r   	loss_dictweight_dicts     r    	<genexpr>z6GroundingDinoForObjectDetectionLoss.<locals>.<genexpr>  s:      TT1CSCSy|k!n,CSCSCSCSTTr"   )r$   rN   rM   rO   rf   rl   rv   auxiliary_lossr	   	two_stageitemsupdatebbox_loss_coefficientgiou_loss_coefficientrangedecoder_layersr   )r&   r   rr   r'   configr(   r{   outputs_classoutputs_coordencoder_logitsencoder_pred_boxesrk   rm   	criterionoutputs_lossr   
aux_outputencoder_outputs_lossencoder_loss_dictenc_weight_dictaux_weight_dictr   r>   r   r   s                         @@@r    #GroundingDinoForObjectDetectionLossr      sx    ,$0@FL\  G 0//F&&  I
 LLL#L!+L!+L )L >)-GG+ 	0 	0J'1J|$&/J{##,=()	,//I 	,$,$"	 
  
 &I&:FCCQQ7H7N7N7P7PQQQ*+++ 11 K  ,II[5F5F5H5HIII?+++ ,v,q011 	U 	UA""#S#S#S#S{?P?P?R?R#S#S#STTTT?+++TTTTTiTTTTTD---r"   )r   r   )NNNN)rB   torch.nnr   image_transformsr   utilsr   loss_for_object_detectionr   r   r	   r
   scipy.optimizer   Tensorintr   r!   r$   rf   r   r+   r"   r    <module>r      s|          7 7 7 7 7 7 & & & & & & f f f f f f f f f f f f  5444444 $" $"L$"\$" $" 	$"
 $" $" $" $"N:t :t :t :t :t$4 :t :t :tzF F F F FY F F Fb F. F. F. F. F. F.r"   