
     `iWM                        d Z ddlZddlZddlmZmZ ddlZddlm	Z	 ddl
mZmZ ddlmZmZmZmZ ddlmZmZmZmZmZmZmZ dd	lmZmZmZmZ dd
lm Z   e            rddl!Z!ddl"m#Z#m$Z$m%Z%  e            rddl&Z& ej'        e(          Z)dZ*d Z+	 	 	 	 	 	 	 	 	 d$de,de-de,de,de-de-de-de-dee.         dee,         de#j#        fdZ/	 d%dej0        de,d eee,e1f                  fd!Z2 G d" d#e          Z3d#gZ4dS )&z%Image processor class for Pix2Struct.    N)OptionalUnion)hf_hub_download   )BaseImageProcessorBatchFeature)convert_to_rgb	normalizeto_channel_dimension_formatto_pil_image)ChannelDimension
ImageInputget_image_sizeinfer_channel_dimension_formatmake_flat_list_of_imagesto_numpy_arrayvalid_images)
TensorTypeis_torch_availableis_vision_availablelogging)requires_backends)Image	ImageDraw	ImageFontzybelkada/fontsc                 <   t          t          dg           |                     d          } t          j        j                            | ||f||f          }|                    |                     d          |                     d          ||d          }|	                    ddddd                              |                     d          |z  |                     d          |z  |                     d          |z  |z            }|                    d          S )	a  
    Utility function to extract patches from a given image tensor. Returns a tensor of shape
    (1, `rows`, `columns`, `num_channels`x `patch_height` x `patch_width`).

    Args:
        image_tensor (torch.Tensor):
            The image tensor to extract patches from.
        patch_height (int):
            The height of the patches to extract.
        patch_width (int):
            The width of the patches to extract.
    torchr   )stride         r   )
r   torch_extract_patches	unsqueezer   nn
functionalunfoldreshapesizepermute)image_tensorpatch_heightpatch_widthpatchess       /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/pix2struct/image_processing_pix2struct.pyr#   r#   4   s    +gY777))!,,Lh!((k7R\hju[v(wwGool//22L4E4Ea4H4H,XceghhGooaAq!,,44!,!+!|+k9 G
 Q    $   blackwhite   text	text_size
text_colorbackground_colorleft_paddingright_paddingtop_paddingbottom_padding
font_bytes	font_pathreturnc
                 z   t          t          d           t          j        d          }
|
                    |           }d                    |          }||	t          j        |          }n|	|	}nt          t          d          }t          j        |d|	          }t          j        t          j        d
d|                    }|                    d||          \  }}}}||z   |z   }||z   |z   }t          j        d
||f|          }t          j        |          }|                    ||f|||           |S )a  
    Render text. This script is entirely adapted from the original script that can be found here:
    https://github.com/google-research/pix2struct/blob/main/pix2struct/preprocessing/preprocessing_utils.py

    Args:
        text (`str`, *optional*, defaults to ):
            Text to render.
        text_size (`int`, *optional*, defaults to 36):
            Size of the text.
        text_color (`str`, *optional*, defaults to `"black"`):
            Color of the text.
        background_color (`str`, *optional*, defaults to `"white"`):
            Color of the background.
        left_padding (`int`, *optional*, defaults to 5):
            Padding on the left.
        right_padding (`int`, *optional*, defaults to 5):
            Padding on the right.
        top_padding (`int`, *optional*, defaults to 5):
            Padding on the top.
        bottom_padding (`int`, *optional*, defaults to 5):
            Padding on the bottom.
        font_bytes (`bytes`, *optional*):
            Bytes of the font to use. If `None`, the default font will be used.
        font_path (`str`, *optional*):
            Path to the font to use. If `None`, the default font will be used.
    visionP   )width)r5   
Nz	Arial.TTFzUTF-8)encodingr)   RGB)r   r   r   r   )xyr5   fillfont)r   render_texttextwrapTextWrapperwrapjoinioBytesIOr   DEFAULT_FONT_PATHr   truetyper   Drawr   newtextbboxr5   )r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   wrapperlineswrapped_textrJ   	temp_draw_
text_widthtext_heightimage_widthimage_heightimagedraws                         r/   rK   rK   O   sK   L k8,,, ",,,GLLdL##E99U##L)"3z*%%		0+>>dW9EEED uy8HIIJJI$-$6$6v|T$R$R!Aq*k |+m;K,~=LIek<8:JKKE>%  DII,,<jW[I\\\Lr0   r`   headerinput_data_formatc                    t          t          d           t          | |          } t          |fi |}t	          |j        | j                  }t          | j        || j        z  z            }t          |j        ||j        z  z            }t          j	        d|||z   fd          }|
                    |                    ||f          d           |
                    |                     ||f          d|f           t          |          }t          |          t          j        k    rt!          |t          j                  }|S )a  
    Renders the input text as a header on the input image.

    Args:
        image (`np.ndarray`):
            The image to render the header on.
        header (`str`):
            The header text.
        data_format (`Union[ChannelDimension, str]`, *optional*):
            The data format of the image. Can be either "ChannelDimension.channels_first" or
            "ChannelDimension.channels_last".

    Returns:
        `np.ndarray`: The image with the header rendered.
    rA   )rc   rF   r3   rG   r   )r   render_headerr   rK   maxrC   intheightr   rU   pasteresizer   r   r   LASTr   )	r`   rb   rc   kwargsheader_image	new_width
new_heightnew_header_height	new_images	            r/   re   re      s;   $ mX... 2CDDDEv0000LL&44IU\Y%<=>>JL/9|?Q3QRSS	%)Z:K-K!LgVVIOOL''4E(FGGPPPOOELL)Z!899A?P;QRRR y))I%i004D4III/	;K;PQQ	r0   c                       e Zd ZdZddgZ	 	 	 	 	 dded	ed
eeee	f                  de	deddf fdZ
	 ddej        de	d
edeeeef                  dej        f
dZ	 	 ddej        deeeef                  deeeef                  dej        fdZddddddej        dfdedee         dee         d	ee         dee	         d
eeee	f                  deeeef                  dedeeeef                  defdZ xZS )Pix2StructImageProcessoraj  
    Constructs a Pix2Struct image processor.

    Args:
        do_convert_rgb (`bool`, *optional*, defaults to `True`):
            Whether to convert the image to RGB.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
            method. According to Pix2Struct paper and code, the image is normalized with its own mean and standard
            deviation.
        patch_size (`dict[str, int]`, *optional*, defaults to `{"height": 16, "width": 16}`):
            The patch size to use for the image. According to Pix2Struct paper and code, the patch size is 16x16.
        max_patches (`int`, *optional*, defaults to 2048):
            The maximum number of patches to extract from the image as per the [Pix2Struct
            paper](https://huggingface.co/papers/2210.03347).
        is_vqa (`bool`, *optional*, defaults to `False`):
            Whether or not the image processor is for the VQA task. If `True` and `header_text` is passed in, text is
            rendered onto the input images.
    flattened_patchesattention_maskTN   Fdo_convert_rgbdo_normalize
patch_sizemax_patchesis_vqar?   c                      t                      j        di | ||nddd| _        || _        || _        || _        || _        d S )N   )rh   rC    )super__init__ry   rx   rw   rz   r{   )selfrw   rx   ry   rz   r{   rl   	__class__s          r/   r   z!Pix2StructImageProcessor.__init__   s^     	""6"""(2(>**r\^D_D_(,&r0   r`   rc   c           	      6   t          | j        d           t          |t          j        |          }t          j        |          }|d         |d         }}t          |t          j                  \  }}	t          j	        |||z  z  ||	z  z            }
t          t          t          j        |
|z  |z            |          d          }t          t          t          j        |
|	z  |z            |          d          }t          ||z  d          }t          ||z  d          }t
          j        j                            |                    d          ||fddd	                              d          }t%          |||          }|j        }|d         }|d
         }|d         }|                    ||z  |g          }t          j        |                              |dg                              d|                              ||z  dg          }t          j        |                              d|g                              |d                              ||z  dg          }|dz  }|dz  }|                    t
          j                  }|                    t
          j                  }t          j        |||gd          }t
          j        j                            |ddd|||z  z
  g                                          }t9          |          }|S )a  
        Extract flattened patches from an image.

        Args:
            image (`np.ndarray`):
                Image to extract flattened patches from.
            max_patches (`int`):
                Maximum number of patches to extract.
            patch_size (`dict`):
                Dictionary containing the patch height and width.

        Returns:
            result (`np.ndarray`):
                A sequence of `max_patches` flattened patches.
        r   rh   rC   r   r   bilinearFT)r)   modealign_corners	antialiasr"   r   r    )r   extract_flattened_patchesr   r   FIRSTr   
from_numpyr   mathsqrtrf   minfloorr%   r&   interpolater$   squeezer#   shaper(   arangerepeattofloat32catpadfloatr   )r   r`   rz   ry   rc   rl   r,   r-   r_   r^   scalenum_feasible_rowsnum_feasible_colsresized_heightresized_widthr.   patches_shaperowscolumnsdepthrow_idscol_idsresults                          r/   r   z2Pix2StructImageProcessor.extract_flattened_patches   s   . 	$8'BBB ,E3C3IK\]] ''$.x$8*W:Mk$25:J:P$Q$Q!k 	+)DEWbIbcddDJu|/Cl/R$S$SU` a acdeeDJu{/B[/P$Q$QS^ _ _abcc.=qAA-;Q??#//OOA -0 0 
 
 '!** 	 (|[IIQ"a  //4'>5"9:: ,t$$,,dAY77>>q'JJRRTX[bTbdeSfgg,w''//G==DDT1MMUUW[^eWeghVijj 	11 **U]++**U]++ GWg6;; $((!Q;$QX.;Y1Z[[aacc''r0   data_formatc           	      V   |j         t          j        k    r|                    t          j                  }t          j        |          }t          j        |          }t          |dt          j	        t          j
        |j                            z            }t          |f||||d|S )a  
        Normalize an image. image = (image - image_mean) / image_std.

        The image std is to mimic the tensorflow implementation of the `per_image_standardization`:
        https://www.tensorflow.org/api_docs/python/tf/image/per_image_standardization

        Args:
            image (`np.ndarray`):
                Image to normalize.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the output image. If unset, the channel dimension format of the input
                image is used.
            input_data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format of the input image. If not provided, it will be inferred.
        g      ?)meanstdr   rc   )dtypenpuint8astyper   r   r   rf   r   r   prodr   r
   )r   r`   r   rc   rl   r   r   adjusted_stddevs           r/   r
   z"Pix2StructImageProcessor.normalize5  s    , ;"(""LL,,E wu~~fUmmc3275;3G3G)H)H#HII
#/
 
 
 
 	
r0   imagesheader_textreturn_tensorsc
                 
   	 ||n j         }||n j        }n j        n j         j        }|
                    d          t          d          t          |          }t          |          st          d          |rd |D             }d |D             }	t          |d                   	|rt          d          |

                    d	d          |

                    d
d          t          t                    rgt          |          z  fdt          |          D             }|r	 fd|D             }	 fd|D             }d |D             }t          ||d|          }|S )a  
        Preprocess an image or batch of images. The processor first computes the maximum possible number of
        aspect-ratio preserving patches of size `patch_size` that can be extracted from the image. It then pads the
        image with zeros to make the image respect the constraint of `max_patches`. Before extracting the patches the
        images are standardized following the tensorflow implementation of `per_image_standardization`
        (https://www.tensorflow.org/api_docs/python/tf/image/per_image_standardization).


        Args:
            images (`ImageInput`):
                Image to preprocess. Expects a single or batch of images.
            header_text (`Union[list[str], str]`, *optional*):
                Text to render as a header. Only has an effect if `image_processor.is_vqa` is `True`.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
                Whether to convert the image to RGB.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            max_patches (`int`, *optional*, defaults to `self.max_patches`):
                Maximum number of patches to extract.
            patch_size (`dict`, *optional*, defaults to `self.patch_size`):
                Dictionary containing the patch height and width.
            return_tensors (`str` or `TensorType`, *optional*):
                The type of tensors to return. Can be one of:
                    - Unset: Return a list of `np.ndarray`.
                    - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
                    - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - Unset: Use the channel dimension format of the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
        Nr   z8data_format is not an accepted input as the outputs are zkInvalid image type. Must be of type PIL.Image.Image, numpy.ndarray, torch.Tensor, tf.Tensor or jax.ndarray.c                 ,    g | ]}t          |          S r~   )r	   .0r`   s     r/   
<listcomp>z7Pix2StructImageProcessor.preprocess.<locals>.<listcomp>  s     @@@nU++@@@r0   c                 ,    g | ]}t          |          S r~   )r   r   s     r/   r   z7Pix2StructImageProcessor.preprocess.<locals>.<listcomp>  s     <<<E.''<<<r0   r   z.A header text must be provided for VQA models.r=   r>   c                 H    g | ]\  }}t          ||                    S ))r=   r>   )re   )r   ir`   r=   r>   r   s      r/   r   z7Pix2StructImageProcessor.preprocess.<locals>.<listcomp>  sC       Au e[^
V_```  r0   c                 >    g | ]}                     |           S ))r`   rc   )r
   )r   r`   rc   r   s     r/   r   z7Pix2StructImageProcessor.preprocess.<locals>.<listcomp>  s+    kkk[`dnn5DUnVVkkkr0   c                 B    g | ]}                     |           S ))r`   rz   ry   rc   )r   )r   r`   rc   rz   ry   r   s     r/   r   z7Pix2StructImageProcessor.preprocess.<locals>.<listcomp>  sJ     
 
 
  **_p +  
 
 
r0   c                 |    g | ]9}|                     d           dk                        t          j                  :S )r    )axisr   )sumr   r   r   r   s     r/   r   z7Pix2StructImageProcessor.preprocess.<locals>.<listcomp>  s;    \\\EEII2I..!3;;BJGG\\\r0   )rt   ru   )datatensor_type)rx   rw   ry   rz   r{   get
ValueErrorr   r   r   pop
isinstancestrlen	enumerater   )r   r   r   rw   rx   rz   ry   r   r   rc   rl   r{   attention_masksencoded_outputsr=   r>   s   ` `  ``  `    @@r/   
preprocessz#Pix2StructImageProcessor.preprocess\  s9   j (4'?||TEV+9+E4K^#-#9ZZt
%0%<kk$BR::m$$0WXXX)&11F## 	:    	A@@@@@F =<V<<<$ >vay I I 	" !QRRRL$77J

;55I+s++ :*mc&kk9      )& 1 1  F
  	lkkkkkdjkkkF
 
 
 
 
 
 
  	
 
 
 ]\U[\\\&'-QQ_m
 
 
 r0   )TTNrv   FN)NN)__name__
__module____qualname____doc__model_input_namesboolr   dictr   rg   r   r   ndarrayr   r   r   r
   r   r   r   r   __classcell__)r   s   @r/   rs   rs      st        ( -.>?  $!/3   T#s(^,	
   
     * EIO OzO O 	O
 $E#/?*?$@AO 
O O O Oh ?CDH	%
 %
z%
 eC)9$9:;%
 $E#/?*?$@A	%
 
%
 %
 %
 %
T &*)-'+%)/3;?(8(>DHq qq c]q !	q
 tnq c]q T#s(^,q !sJ!78q &q $E#/?*?$@Aq 
q q q q q q q qr0   rs   )	r1   r2   r3   r4   r4   r4   r4   NNr   )5r   rP   r   typingr   r   numpyr   huggingface_hubr   image_processing_utilsr   r   image_transformsr	   r
   r   r   image_utilsr   r   r   r   r   r   r   utilsr   r   r   r   utils.import_utilsr   rL   PILr   r   r   r   
get_loggerr   loggerrR   r#   r   rg   bytesrK   r   ChildProcessErrorre   rs   __all__r~   r0   r/   <module>r      s   , + 				  " " " " " " " "     + + + + + + F F F F F F F F d d d d d d d d d d d d                  R Q Q Q Q Q Q Q Q Q Q Q 3 3 3 3 3 3  0OOO////////// LLL		H	%	%$      : #"&#@ @
@@ @ 	@
 @ @ @ @ @ }@ [@ @ @ @J bf' ':'"'7?cK\F\@]7^' ' ' 'TP P P P P1 P P Pf &
&r0   