
     `i)                         d dl mZmZmZmZ ddlmZmZmZm	Z	m
Z
 ddlmZmZ  e            rd dlmZ ddlmZmZ  e            rd dlZd d	lmZ dd
lmZ  e	j        e          Z e ed                     G d de                      ZdS )    )AnyOptionalUnionoverload   )add_end_docstringsis_torch_availableis_vision_availableloggingrequires_backends   )ChunkPipelinebuild_pipeline_init_args)Image)
load_imagevalid_imagesN)BaseModelOutput)2MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMEST)has_image_processorc                       e Zd ZdZdZdZdZdZ fdZe	de
edf         de
eee         f         ded	eeeef                  fd
            Ze	deeeef                  ded	eeeeef                           fd            Z	 dde
edeeeef                  f         dee
eee         f                  ded	e
eeeef                  eeeeef                           f         f fdZd ZddZd ZddZddd	eeef         fdZ xZS )ZeroShotObjectDetectionPipelinea  
    Zero shot object detection pipeline using `OwlViTForObjectDetection`. This pipeline predicts bounding boxes of
    objects when you provide an image and a set of `candidate_labels`.

    Example:

    ```python
    >>> from transformers import pipeline

    >>> detector = pipeline(model="google/owlvit-base-patch32", task="zero-shot-object-detection")
    >>> detector(
    ...     "http://images.cocodataset.org/val2017/000000039769.jpg",
    ...     candidate_labels=["cat", "couch"],
    ... )
    [{'score': 0.287, 'label': 'cat', 'box': {'xmin': 324, 'ymin': 20, 'xmax': 640, 'ymax': 373}}, {'score': 0.254, 'label': 'cat', 'box': {'xmin': 1, 'ymin': 55, 'xmax': 315, 'ymax': 472}}, {'score': 0.121, 'label': 'couch', 'box': {'xmin': 4, 'ymin': 0, 'xmax': 642, 'ymax': 476}}]

    >>> detector(
    ...     "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png",
    ...     candidate_labels=["head", "bird"],
    ... )
    [{'score': 0.119, 'label': 'bird', 'box': {'xmin': 71, 'ymin': 170, 'xmax': 410, 'ymax': 508}}]
    ```

    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)

    This object detection pipeline can currently be loaded from [`pipeline`] using the following task identifier:
    `"zero-shot-object-detection"`.

    See the list of available models on
    [huggingface.co/models](https://huggingface.co/models?filter=zero-shot-object-detection).
    FTc                      t                      j        di | | j        dk    rt          d| j         d          t          | d           |                     t                     d S )NtfzThe z is only available in PyTorch.vision )super__init__	framework
ValueError	__class__r   check_model_typer   )selfkwargsr    s     /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/pipelines/zero_shot_object_detection.pyr   z(ZeroShotObjectDetectionPipeline.__init__=   st    ""6""">T!!RDNRRRSSS$)))PQQQQQ    imagezImage.Imagecandidate_labelsr#   returnc                     d S Nr   )r"   r&   r'   r#   s       r$   __call__z(ZeroShotObjectDetectionPipeline.__call__F   s	      #sr%   c                     d S r*   r   )r"   r&   r#   s      r$   r+   z(ZeroShotObjectDetectionPipeline.__call__K   s    beber%   Nc           	         d|v r|                     d          }t          |t          t          j        f          r||d}nmt          |t          t
          f          rNt          |          r?t	           t                      j        d t          ||          D             fi |          S 	 |} t                      j        |fi |}|S )a|  
        Detect objects (bounding boxes & classes) in the image(s) passed as inputs.

        Args:
            image (`str`, `PIL.Image` or `list[dict[str, Any]]`):
                The pipeline handles three types of images:

                - A string containing an http url pointing to an image
                - A string containing a local path to an image
                - An image loaded in PIL directly

                You can use this parameter to send directly a list of images, or a dataset or a generator like so:

                ```python
                >>> from transformers import pipeline

                >>> detector = pipeline(model="google/owlvit-base-patch32", task="zero-shot-object-detection")
                >>> detector(
                ...     [
                ...         {
                ...             "image": "http://images.cocodataset.org/val2017/000000039769.jpg",
                ...             "candidate_labels": ["cat", "couch"],
                ...         },
                ...         {
                ...             "image": "http://images.cocodataset.org/val2017/000000039769.jpg",
                ...             "candidate_labels": ["cat", "couch"],
                ...         },
                ...     ]
                ... )
                [[{'score': 0.287, 'label': 'cat', 'box': {'xmin': 324, 'ymin': 20, 'xmax': 640, 'ymax': 373}}, {'score': 0.25, 'label': 'cat', 'box': {'xmin': 1, 'ymin': 55, 'xmax': 315, 'ymax': 472}}, {'score': 0.121, 'label': 'couch', 'box': {'xmin': 4, 'ymin': 0, 'xmax': 642, 'ymax': 476}}], [{'score': 0.287, 'label': 'cat', 'box': {'xmin': 324, 'ymin': 20, 'xmax': 640, 'ymax': 373}}, {'score': 0.254, 'label': 'cat', 'box': {'xmin': 1, 'ymin': 55, 'xmax': 315, 'ymax': 472}}, {'score': 0.121, 'label': 'couch', 'box': {'xmin': 4, 'ymin': 0, 'xmax': 642, 'ymax': 476}}]]
                ```


            candidate_labels (`str` or `list[str]` or `list[list[str]]`):
                What the model should recognize in the image.

            threshold (`float`, *optional*, defaults to 0.1):
                The probability necessary to make a prediction.

            top_k (`int`, *optional*, defaults to None):
                The number of top predictions that will be returned by the pipeline. If the provided number is `None`
                or higher than the number of predictions available, it will default to the number of predictions.

            timeout (`float`, *optional*, defaults to None):
                The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
                the call may block forever.


        Return:
            A list of lists containing prediction results, one list per input image. Each list contains dictionaries
            with the following keys:

            - **label** (`str`) -- Text query corresponding to the found object.
            - **score** (`float`) -- Score corresponding to the object (between 0 and 1).
            - **box** (`dict[str,int]`) -- Bounding box of the detected object in image's original size. It is a
              dictionary with `x_min`, `x_max`, `y_min`, `y_max` keys.
        text_queriesr&   r'   c              3   &   K   | ]\  }}||d V  dS )r/   Nr   ).0imglabelss      r$   	<genexpr>z;ZeroShotObjectDetectionPipeline.__call__.<locals>.<genexpr>   s.      ppKCs??ppppppr%   )
pop
isinstancestrr   listtupler   r   r+   zip)r"   r&   r'   r#   inputsresultsr    s         r$   r+   z(ZeroShotObjectDetectionPipeline.__call__N   s    ~ V##%zz.99ec5;/00 	$:JKKFFe}-- 	,u2E2E 	  ppSVW\^nSoSoppp     F"%''"644V44r%   c                 n    i }d|v r|d         |d<   i }d|v r|d         |d<   d|v r|d         |d<   |i |fS )Ntimeout	thresholdtop_kr   )r"   r#   preprocess_paramspostprocess_paramss       r$   _sanitize_parametersz4ZeroShotObjectDetectionPipeline._sanitize_parameters   sl    +1)+<i(&  .4[.A{+f*0/w' "&888r%   c              #     K   t          |d         |          }|d         }t          |t                    r|                    d          }t	          j        |j        |j        ggt          j                  }t          |          D ]\  }}| 
                    || j                  }|                     || j                  }	| j        dk    r|	                    | j                  }	|t          |          dz
  k    ||d	||	V  d S )
Nr&   )r>   r'   ,)dtype)return_tensorsptr   )is_lasttarget_sizecandidate_label)r   r6   r7   splittorchtensorheightwidthint32	enumerate	tokenizerr   image_processortorF   len)
r"   r;   r>   r&   r'   rJ   irK   text_inputsimage_featuress
             r$   
preprocessz*ZeroShotObjectDetectionPipeline.preprocess   s9     6'?G<<<!"45&,, 	;/55c::lU\5;$?#@TTT"+,<"="= 	 	A...XXK!11%1WWN~%%!/!2!24:!>!>$4 5 5 99*#2  	
 !    	 	r%   c                     |                     d          }|                     d          }|                     d          } | j        di |}|||d|}|S )NrJ   rK   rI   )rJ   rK   rI   r   )r5   model)r"   model_inputsrJ   rK   rI   outputsmodel_outputss          r$   _forwardz(ZeroShotObjectDetectionPipeline._forward   so    "&&}55&**+<==""9--$*,,|,,(3dkwwovwr%   皙?c                    g }|D ]}|d         }t          |          }| j                            |||d                   d         }|d                                         D ]d}|d         |                                         }	|                     |d         |         d                   }
|	||
d}|                    |           et          |d d	
          }|r
|d |         }|S )NrK   rJ   )r^   r?   target_sizesr   scoresboxes)scorelabelboxc                     | d         S )Nrf   r   )xs    r$   <lambda>z=ZeroShotObjectDetectionPipeline.postprocess.<locals>.<lambda>   s
    '
 r%   T)keyreverse)r   rT   post_process_object_detectionnonzeroitem_get_bounding_boxappendsorted)r"   r_   r?   r@   r<   model_outputrg   r^   indexrf   rh   results               r$   postprocessz+ZeroShotObjectDetectionPipeline.postprocess   s   ) 	' 	'L !23E*<88L*HH$	UbHc I  G !*2244 ' ')%05577,,WW-=e-DQ-GHH#(5EEv&&&&' &:&:DIII 	&fufoGr%   rh   ztorch.Tensorc                     | j         dk    rt          d          |                                                                \  }}}}||||d}|S )a%  
        Turns list [xmin, xmax, ymin, ymax] into dict { "xmin": xmin, ... }

        Args:
            box (`torch.Tensor`): Tensor containing the coordinates in corners format.

        Returns:
            bbox (`dict[str, int]`): Dict containing the coordinates in corners format.
        rH   zAThe ZeroShotObjectDetectionPipeline is only available in PyTorch.)xminyminxmaxymax)r   r   inttolist)r"   rh   ry   rz   r{   r|   bboxs          r$   rq   z1ZeroShotObjectDetectionPipeline._get_bounding_box   sa     >T!!`aaa!$!1!1!3!3dD$	
 
 r%   r*   )ra   N)__name__
__module____qualname____doc___load_processor_load_image_processor_load_feature_extractor_load_tokenizerr   r   r   r7   r8   r   dictr+   r   rC   rZ   r`   rw   r}   rq   __classcell__)r    s   @r$   r   r      s/        @ O #OR R R R R #3-.#BGTRUYBW#cf#	d38n	# # # X# ed4S>2eced4PTUXZ]U]P^K_F`eee Xe
 =AV VS-d38n)==>V #5d3i#89V 	V
 
tDcN#T$tCH~*>%??	@V V V V V Vp	9 	9 	9   (     ,^ S#X        r%   r   )typingr   r   r   r   utilsr   r	   r
   r   r   baser   r   PILr   image_utilsr   r   rM   transformers.modeling_outputsr   models.auto.modeling_autor   
get_loggerr   loggerr   r   r%   r$   <module>r      s_   1 1 1 1 1 1 1 1 1 1 1 1 k k k k k k k k k k k k k k 9 9 9 9 9 9 9 9  766666666 _LLL======^^^^^^		H	%	% ,,FFFGGa a a a am a a HGa a ar%   