
    Pi6                        d dl Z d dlZd dlmZ d dlmZmZ d dlZd dl	Z	d dl
mZ d dlmZ d Zd Zde	j        fd	Zd
 Z G d d          Zddd e	j        d          fdZddd e	j        d          fdZdd e	j        d          fdZd Zd Zdeej        e	j        f         fdZdeej        e	j        f         deeeeef         fdZdS )    N)Thread)TupleUnion)Image)tqdmc                      t           j                                        rt           j                            d          j        dk     } t           j                            d          j        dk    }|st          j        dt          d           t          d t           j	        
                    d          d d         D                       }|d	k     r*t          j        d
t           j	         dt          d           |d	k     p| }nd} d}d}| ||fS )Nr         zSFlash Attention is disabled as it requires a GPU with Ampere (8.0) CUDA capability.   category
stacklevelc              3   4   K   | ]}t          |          V  d S N)int).0vs     s/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torchao/_models/sam2/utils/misc.py	<genexpr>z$get_sdpa_settings.<locals>.<genexpr>   s(      QQ1AQQQQQQ    .)r   r   zYou are using PyTorch zw without Flash Attention v2 support. Consider upgrading to PyTorch 2.2+ for Flash Attention v2 (which could be faster).TF)torchcudais_availableget_device_propertiesmajorwarningswarnUserWarningtuple__version__split)old_gpuuse_flash_attnpytorch_versionmath_kernel_ons       r   get_sdpa_settingsr'      s-   z   *22155;a?99!<<BaG 	Me$     QQ0A0G0G0L0LRaR0PQQQQQV##Me): e e e$	    )61G5GNN22r   c                     ddl m} |                    |                     t          j                                                            S )a  
    Get the connected components (8-connectivity) of binary masks of shape (N, 1, H, W).

    Inputs:
    - mask: A binary mask tensor of shape (N, 1, H, W), where 1 is foreground and 0 is
            background.

    Outputs:
    - labels: A tensor of shape (N, 1, H, W) containing the connected component labels
              for foreground pixels and 0 for background pixels.
    - counts: A tensor of shape (N, 1, H, W) containing the area of the connected
              components for foreground pixels and 0 for background pixels.
    r   )_C)sam2r)   get_connected_componnetstor   uint8
contiguous)maskr)   s     r   get_connected_componentsr0   0   sD     &&twwu{';';'F'F'H'HIIIr   masksc                    | j         \  }}}}| j        }t          j        ||t          j                  }t          j        ||t          j                  }t          j        ||d          \  }}	|d                             |d||          }|	d                             |d||          }	t          j        t          j        | ||          	                    d          d          \  }
}t          j
        t          j        | |d          	                    d          d          \  }}t          j        t          j        | |	|          	                    d          d          \  }}t          j
        t          j        | |	d          	                    d          d          \  }}t          j        |
|||fd          }|S )	z
    compute bounding box given an input mask

    Inputs:
    - masks: [B, 1, H, W] masks, dtype=torch.Tensor

    Returns:
    - box_coords: [B, 1, 4], contains (x, y) coordinates of top left and bottom right box corners, dtype=torch.Tensor
    )devicedtypexy)indexing)NN.   dim)shaper3   r   arangeint32meshgridexpandminwhereflattenmaxstack)r1   B_hwr3   xsysgrid_xsgrid_ysmin_xsmax_xsmin_ysmax_ysbbox_coordss                  r   mask_to_boxrS   C   s    JAq!Q\F	aek	:	:	:B	aek	:	:	:B~b"t<<<GWo&--aAq99Go&--aAq99G	%+eWa88@@DD"MMMIFA	%+eWb99AA"EE2NNNIFA	%+eWa88@@DD"MMMIFA	%+eWb99AA"EE2NNNIFA+vvvv>BGGGKr   c                 |   t          j        |           }t          j        |                    d                              ||f                    }|j        t          j        k    r|dz  }nt          d|j         d|            t          j
        |                              ddd          }|j        \  }}|||fS )NRGB     o@zUnknown image dtype: z on r   r   r7   )r   opennparrayconvertresizer4   r-   RuntimeErrorr   
from_numpypermutesize)img_path
image_sizeimg_pilimg_npimgvideo_widthvideo_heights          r   _load_img_as_tensorrg   ]   s    j""GXgooe,,33Z4LMMNNF|rx%O6<OOXOOPPP

6
"
"
*
*1a
3
3C 'Kk))r   c                   $    e Zd ZdZd Zd Zd ZdS )AsyncVideoFrameLoaderzZ
    A list of video frames to be load asynchronously without blocking session start.
    c                 H    | _         | _        | _        | _        | _        d gt          |          z   _        d  _        d  _        d  _	        | _
                             d            fd}t          |d           _         j                                         d S )Nr   c                      	 t          t          t          j                            d          D ]}                     |            d S # t
          $ r}|_        Y d }~d S d }~ww xY w)Nframe loading (JPEG)desc)r   rangelenimages__getitem__	Exception	exception)neselfs     r   _load_framesz4AsyncVideoFrameLoader.__init__.<locals>._load_frames   s    #eC$4$455<RSSS ( (A$$Q''''( ( # # #!"#s   AA 
A)A$$A)T)targetdaemon)	img_pathsra   offload_video_to_cpuimg_meanimg_stdrp   rq   rt   rf   re   compute_devicerr   r   threadstart)rw   r{   ra   r|   r}   r~   r   rx   s   `       r   __init__zAsyncVideoFrameLoader.__init__n   s     #$$8! fs9~~- , 		# 	# 	# 	# 	# L>>>r   c                 F   | j         t          d          | j         | j        |         }||S t          | j        |         | j                  \  }}}|| _        || _        || j        z  }|| j	        z  }| j
        s|                    | j        d          }|| j        |<   |S )NzFailure in frame loading threadT)non_blocking)rt   r\   rq   rg   r{   ra   rf   re   r}   r~   r|   r,   r   )rw   indexrd   rf   re   s        r   rr   z!AsyncVideoFrameLoader.__getitem__   s    >%@AAt~Uk% ?J)<N5!4?*
 *
&\; )&t}t|( 	A&&,4&@@C E
r   c                 *    t          | j                  S r   )rp   rq   )rw   s    r   __len__zAsyncVideoFrameLoader.__len__   s    4;r   N)__name__
__module____qualname____doc__r   rr   r    r   r   ri   ri   i   sL         $ $ $L  *         r   ri   )g
ףp=
?gv/?gCl?)gZd;O?gy&1?g?Fr   c           	      f   t          | t                    }t          | t                    }|o&t          j                            |           d         dv }	|s|	rt          | |||||          S |r5t          j                            |           rt          | ||||||          S t          d          )z
    Load the video frames from video_path. The frames are resized to image_size as in
    the model and are loaded to GPU if offload_video_to_cpu=False. This is used by the demo.
    r9   )z.mp4z.MP4)
video_pathra   r|   r}   r~   r   )r   ra   r|   r}   r~   async_loading_framesr   z;Only MP4 video and JPEG folder are supported at this moment)

isinstancebytesstrospathsplitext!load_video_frames_from_video_fileisdir!load_video_frames_from_jpg_imagesNotImplementedError)
r   ra   r|   r}   r~   r   r   is_bytesis_stris_mp4_paths
             r   load_video_framesr      s     *e,,H
C((FQRW--j99"=AQQK 
; 
0!!!5)
 
 
 	
 
 
BGMM*-- 
0!!!5!5)
 
 
 	
 "I
 
 	
r   c                    t          | t                    r"t          j                            |           r| nt          d          d t          j                  D             }|                    d            t          |          }|dk    rt          d           fd|D             }	t          j        |t          j                  d	d	d	d	f         }t          j        |t          j                  d	d	d	d	f         }|r#t          |	|||||          }
|
|
j        |
j        fS t          j        |d
||t          j                  }t#          t%          |	d                    D ]\  }}t'          ||          \  ||<   }}|s?|                    |          }|                    |          }|                    |          }||z  }||z  }|||fS )aX  
    Load the video frames from a directory of JPEG files ("<frame_index>.jpg" format).

    The frames are resized to image_size x image_size and are loaded to GPU if
    `offload_video_to_cpu` is `False` and to CPU if `offload_video_to_cpu` is `True`.

    You can load a frame asynchronously by setting `async_loading_frames` to `True`.
    ak  Only JPEG frames are supported at this moment. For video files, you may use ffmpeg (https://ffmpeg.org/) to extract frames into a folder of JPEG files, such as 
```
ffmpeg -i <your_video>.mp4 -q:v 2 -start_number 0 <output_dir>/'%05d.jpg'
```
where `-q:v` generates high-quality JPEG frames and `-start_number 0` asks ffmpeg to start the JPEG file from 00000.jpg.c                 `    g | ]+}t           j                            |          d          dv )|,S )r9   )z.jpgz.jpegz.JPGz.JPEG)r   r   r   )r   ps     r   
<listcomp>z5load_video_frames_from_jpg_images.<locals>.<listcomp>   sC       7Ar"&HHH 	
HHHr   c                 f    t          t          j                            |           d                   S )Nr   )r   r   r   r   )r   s    r   <lambda>z3load_video_frames_from_jpg_images.<locals>.<lambda>   s#    3rw'7'7':':1'=#>#> r   )keyr   zno images found in c                 P    g | ]"}t           j                            |          #S r   )r   r   join)r   
frame_name
jpg_folders     r   r   z5load_video_frames_from_jpg_images.<locals>.<listcomp>   s)    TTT*j*55TTTr   r4   N   rl   rm   )r   r   r   r   r   r   listdirsortrp   r\   r   tensorfloat32ri   rf   re   zeros	enumerater   rg   r,   )r   ra   r|   r}   r~   r   r   frame_names
num_framesr{   lazy_imagesrq   ru   r`   rf   re   r   s                   @r   r   r      s   " *c"" 
rw}}Z'@'@ 


!<
 
 	
 J''  K
 >>???[!!JQ===>>>TTTTTTTI|HEM:::111dD=IHl7%-888D$GG 	N+ 
 
 K4k6MMM[Q
JemTTTF i6L!M!M!MNN Y Y8/B8Z/X/X,q	< ->**;;~..**^,,
hF
gF<,,r   c                    ddl }t          j        |t          j                  ddddf         }t          j        |t          j                  ddddf         }|j                            d           |                    |                                           j        \  }}}	g }
|                    | ||          D ],}|
	                    |
                    ddd                     -t          j        |
d                                          d	z  }
|s?|
                    |          }
|                    |          }|                    |          }|
|z  }
|
|z  }
|
||fS )
z(Load the video frames from a video file.r   Nr   r   )widthheightr   r7   r:   rV   )decordr   r   r   bridge
set_bridgeVideoReadernextr<   appendr^   rE   floatr,   )r   ra   r|   r}   r~   r   r   rf   re   rG   rq   frames               r   r   r     se    MMM|HEM:::111dD=IHl7%-888D$GG
MW%%%#)#5#5j#A#A#F#F#H#H#N L+qF##Jj#TT . .emmAq!,,----[Q'''--//%7F ->**;;~..**^,,
hF
gF<,,r   c                    |dk    s
J d            | }	 t          | dk              \  }}|dk    ||k    z  }t          j        |d|           } n8# t          $ r+}t	          j        | dt          d           |} Y d}~nd}~ww xY w| S )zY
    A post processor to fill small holes in mask scores with area under `max_area`.
    r   zmax_area must be positiveg?a*  

Skipping the post-processing step due to the error above. You can still use SAM 2 and it's OK to ignore the error above, although some post-processing functionality may be limited (which doesn't affect the results in most cases; see https://github.com/facebookresearch/sam2/blob/main/INSTALL.md).r   r   N)r0   r   rB   rs   r   r   r   )r/   max_area
input_masklabelsareasis_holerv   s          r   fill_holes_in_mask_scoresr   9  s     a<<<4<<<J0;;A:%8"34{7C.. 
 
 
 N N N !	
 	
 	
 	
 
 Ks   9A 
B!A>>Bc                     | ||}}n<t          j        | d         |gd          }t          j        | d         |gd          }||dS )zDAdd new points and labels to previous point inputs (add at the end).Npoint_coordsr7   r:   point_labels)r   r   )r   cat)old_point_inputs
new_points
new_labelspointsr   s        r   concat_pointsr   V  s_    #Z,^<jIqQQQ,^<jIqQQQ"F;;;r   imagec                 *   t          | t          j                  r| j        d d         S t          | t          j                  r| j        \  }}}||fS t          | t                    r| j        \  }}||fS t          dt          |                      )Nr   z;Only support np.ndarray, torch.Tensoror PIL Image, but got )
r   rX   ndarrayr<   r   Tensorr   r_   r   type)r   rG   rH   rI   s       r   get_image_sizer   a  s    %$$ 
{2A2	E5<	(	( 	
+1a1v	E5	!	! 
z11v!W$u++WW
 
 	
r   crop_boxc                     |\  }}}}t          | t          j                  r| ||||d d f         S t          | t          j                  r| d d ||||f         S t          dt          |                      )NzAExpected image to be of type np.ndarray or torch.Tensor, but got )r   rX   r   r   r   
ValueErrorr   )r   r   x0y0x1y1s         r   
crop_imager   p  s     NBB%$$ 

RUBrE111_%%	E5<	(	( 
QQQ2r"u_%%3%)%[[3 3
 
 	
r   )r   r   	threadingr   typingr   r   numpyrX   r   PILr   r   r'   r0   r   rS   rg   ri   r3   r   r   r   r   r   r   r   r   r   r   r   r   <module>r      s   
			                                3 3 3<J J J&u|    4	* 	* 	*A  A  A  A  A  A  A  A P #!5<''&
 &
 &
 &
Z #!5<''@- @- @- @-N #!5<''- - - -@  :< < <
%
EL 89 
 
 
 

U\)*
6;Cc3<N6O
 
 
 
 
 
r   