
     `i                         d dl Z d dlmZmZ d dlZddlmZ ddlm	Z	m
Z
 ddlmZmZmZmZmZ ddlmZmZ ddlmZ  G d	 d
e          Z G d ded          Z G d de          ZdgZdS )    N)OptionalUnion   )BatchFeature)
ImageInputmake_nested_list_of_images)ImagesKwargsMultiModalDataProcessingKwargsProcessorMixinUnpack)PreTokenizedInput	TextInput)	to_py_objc                   ~    e Zd ZU ee         ed<   ee         ed<   ee         ed<   ee         ed<   ee         ed<   dS )Gemma3ImagesKwargsdo_pan_and_scanpan_and_scan_min_crop_sizepan_and_scan_max_num_crops"pan_and_scan_min_ratio_to_activatedo_convert_rgbN)__name__
__module____qualname__r   bool__annotations__intfloat     /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/gemma3/processing_gemma3.pyr   r      sb         d^### (--- (---(0777TN"""""r    r   c                   6    e Zd ZU eed<   dddddddddd	Zd
S )Gemma3ProcessorKwargsimages_kwargsFT)paddingreturn_mm_token_type_ids      g333333?)r   r   r   r   r   )text_kwargsr$   N)r   r   r   r   r   	_defaultsr   r    r!   r#   r#   $   sU         %%%% (,
 

 #$*-*+25
 
 IIIr    r#   F)totalc            
            e Zd ZddgZdZdZ	 	 ddef fdZ	 	 	 	 dd	ee	         d
e
eeee         ee         f         dee         defdZddZed             Z xZS )Gemma3Processorimage_processor	tokenizerAutoImageProcessorAutoTokenizerNr'   image_seq_lengthc                    || _         |j        | _        |j        | _        |j        | _        d                    |j        g|z            }d|j         | |j         d| _         t                      j        d|||d| d S )N z

)r.   r/   chat_templater   )	r2   image_token_id	boi_tokenimage_tokenjoin	eoi_tokenfull_image_sequencesuper__init__)selfr.   r/   r5   r2   kwargsimage_tokens_expanded	__class__s          r!   r=   zGemma3Processor.__init__:   s     !1'6",$0 ")>(?BR(R S S#o)*=#o?T#oV_Vi#o#o#o  	
+'	
 	
 		
 	
 	
 	
 	
r    imagestextr?   returnc           	          ||t          d            j        t          fd j        j        i|}t          |t                    r|g}n?t          |t                    s*t          |d         t                    st          d          i }| j	        
                    |          }t          |          }  j	        |fi |d         }|s fd|D             }t          |          t          |          k    r0t          dt          |           dt          |           d	          t          |                    d
                    fd|D             }	t          t!          |||	                    D ]\  }
\  }}d t#          j         j        |          D             }t          |          t          |          k    r0t          dt          |           dt          |           d          t)          t          t!          |                              D ]c\  }}|r\d j         dd                     j        g|z            z   }|d |         |z   ||t           j                  z   d          z   }|||
<   d fd|D             }|d                             dd           }|d                             dd          }  j        dd|i|d         }                     ||dg           |rSt/          j        |d                   }t/          j        |          }d|| j        k    <   |                                |d<   t9          i |||          S ) Nz+Provide at least one of `text` or `images`.tokenizer_init_kwargsr   zAInvalid input text. Please provide a string, or a list of stringsr$   c                 f    g | ]-}d                      j        gt          |          z            .S ) )r9   r7   len).0rB   r>   s     r!   
<listcomp>z,Gemma3Processor.__call__.<locals>.<listcomp>n   s5    ^^^V$.!1CKK!?@@^^^r    z1Received inconsistently sized batches of images (z) and text (z).	num_cropsc                 `    g | ]*}fd t          t          |                    D             +S )c                 :    g | ]}                     d           S )r   )pop)rJ   _rL   s     r!   rK   z7Gemma3Processor.__call__.<locals>.<listcomp>.<listcomp>w   s%    MMMQ	a 0 0MMMr    )rangerI   )rJ   rB   rL   s     r!   rK   z,Gemma3Processor.__call__.<locals>.<listcomp>w   s=    kkkRXMMMM%F:L:LMMMkkkr    c                 6    g | ]}|                                 S r   )start)rJ   ms     r!   rK   z,Gemma3Processor.__call__.<locals>.<listcomp>y   s      X X Xq X X Xr    zPrompt contained z image tokens but received z images.zHere is the original image z0 and here are some crops to help you see better rH   c                 P    g | ]"}|                     j        j                  #S r   )replacer7   r;   )rJ   promptr>   s     r!   rK   z,Gemma3Processor.__call__.<locals>.<listcomp>   s-    ```QWFNN4>43KLL```r    r)   return_tensorsr&   FrC   image)
modalities	input_ids   token_type_ids)datatensor_typer   )
ValueError_merge_kwargsr#   r/   init_kwargs
isinstancestrlist	TypeErrorr.   fetch_imagesr   rI   r   rO   	enumerateziprefinditerr7   reversedr9   _check_special_mm_tokensnparray
zeros_liker6   tolistr   )r>   rB   rC   videosaudior?   output_kwargsimage_inputsbatched_imagesbatch_num_crops	batch_idxrW   image_indexesnumidxformatted_image_textrX   r&   text_inputs	array_idsmm_token_type_idsrL   s   `                    @r!   __call__zGemma3Processor.__call__P   s    <FNJKKK**!
 
"&."<
 
 
 dC   	a6DDD$'' 	a
47C0H0H 	a_```)66v>>F7??N/4/YY-:XYYL  _^^^^~^^^>""c$ii// vNH[H[vvilmqirirvvv  
 ","2"2;"?"?@@Ikkkk\jkkkO:CCn^mDnDn:o:o 1 16	6FFI X XBKPV4W4W X X Xv;;#m"4"444$pC,>,>pp[^_e[f[fppp  
 !)c)].K.K)L)L M M 1 1HC 1z$.zzz!hh'7#'=>>? - "(0D!DvcTWX\XfTgTgNgNiNiGj!j*0Y1 a```[_```D&}599:JDQQ#0#?#C#CD^`e#f#f $dnOO$O-2NOO%%dKWI%NNN $ 	G[!9::I "i 8 8BCi4+>>?,=,D,D,F,FK()!@K!@<!@n]]]]r    c                     i }|C| j         gt          |          z  }dgt          |          z  }|                    ||d           t          di |S )a  
        Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.

        Args:
            image_sizes (`list[list[int]]`, *optional*):
                The input sizes formatted as (height, width) per each image.

        Returns:
            `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
            input modalities, along with other useful data.
        Nr\   )num_image_tokensnum_image_patchesr   )r2   rI   updater
   )r>   image_sizesr?   vision_datar   r   s         r!   _get_num_multimodal_tokensz*Gemma3Processor._get_num_multimodal_tokens   so     " $ 56[9I9II!"c+&6&6 64D[lmmnnn,,,,,r    c                 v    | j         j        dgz   }| j        j        }d |D             }t          ||z             S )Nr]   c                     g | ]
}|d k    |S )rL   r   )rJ   names     r!   rK   z5Gemma3Processor.model_input_names.<locals>.<listcomp>   s$    &k&k&kW[_jWjWjtWjWjWjr    )r/   model_input_namesr.   re   )r>   tokenizer_input_namesimage_processor_input_namess      r!   r   z!Gemma3Processor.model_input_names   sL     $ @DTCU U&*&:&L#&k&k8S&k&k&k#),GGHHHr    )Nr'   )NNNN)N)r   r   r   
attributesimage_processor_classtokenizer_classr   r=   r   r   r   r   r   re   r   r#   r   r   r   propertyr   __classcell__)rA   s   @r!   r-   r-   5   s#       #[1J0%O  #
 

 
 
 
 
 
 
0 (,^bI^ I^$I^ I0$y/4HYCZZ[I^ ./I^ 
I^ I^ I^ I^V- - - -. I I XI I I I Ir    r-   )rj   typingr   r   numpyrn   feature_extraction_utilsr   image_utilsr   r   processing_utilsr	   r
   r   r   r   tokenization_utils_baser   r   utilsr   r   r#   r-   __all__r   r    r!   <module>r      sW    
			 " " " " " " " "     4 4 4 4 4 4 A A A A A A A A f f f f f f f f f f f f f f C C C C C C C C      # # # # # # # #    ,E    "CI CI CI CI CIn CI CI CIL 
r    