
     `i                         d dl mZmZ d dlZddlmZ ddlmZm	Z	 ddl
mZmZmZmZmZ ddlmZmZ  G d d	e          Z G d
 ded          Z G d de          ZdgZdS )    )OptionalUnionN   )BatchFeature)
ImageInputmake_nested_list_of_images)AudioKwargsImagesKwargsProcessingKwargsProcessorMixinUnpack)PreTokenizedInput	TextInputc                   &    e Zd ZU ee         ed<   dS )Gemma3nImagesKwargsdo_convert_rgbN)__name__
__module____qualname__r   bool__annotations__     /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/gemma3n/processing_gemma3n.pyr   r      s"         TN"""""r   r   c                   0    e Zd ZU eed<   eed<   dddiiZdS )Gemma3nProcessorKwargsaudio_kwargsimages_kwargstext_kwargspaddingFN)r   r   r   r	   r   r   	_defaultsr   r   r   r   r      s<         &&&&u
IIIr   r   F)totalc                       e Zd ZdZg dZdZdZdZ	 	 	 dd	ed
ef fdZ		 	 	 	 dde
e         deeeee         ee         f         de
eej        ee         eej                 eee                  f                  dee         def
dZ xZS )Gemma3nProcessorat  
    A processor for Gemma 3n, wrapping the full capabilities of a feature extractor, image processor, and tokenizer
    into a single processor.

    Args:
        feature_extractor (`Gemma3nAudioFeatureExtractor`):
            Feature extractor that converts raw audio waveforms into MEL spectrograms for the audio encoder. This
            should return a `BatchFeature` with `input_features` and `input_features_mask` features.
        image_processor (`SiglipImageProcessorFast`):
            Image processor that prepares batches of images for the vision encoder. This should return a `BatchFeature`
            with a `pixel_values` feature.
        tokenizer (`GemmaTokenizerFast`):
            The text tokenizer for the model.
        chat_template (`string`, *optional*):
            A Jinja template for generating text prompts from a set of messages.
        audio_seq_length (int, *optional*, defaults to 188):
            The number of audio soft tokens that will be added to the text prompt
        image_seq_length (int, *optional*, defaults to 256):
            The number of image soft tokens that should be added to
    )feature_extractorimage_processor	tokenizerAutoFeatureExtractorAutoImageProcessorAutoTokenizerN      audio_seq_lengthimage_seq_lengthc                    || _         |j        | _        |j        | _        |j        | _        d                    |j        g|z            }d|j         | |j         d| _        || _        |j        | _        |j	        | _	        |j
        | _
        d                    |j
        g|z            }	d|j	         |	 |j         d| _         t                      j        d||||d| d S )N z

)r%   r&   r'   chat_templater   )r-   audio_token_id	boa_tokenaudio_tokenjoin	eoa_tokenfull_audio_sequencer.   image_token_id	boi_tokenimage_token	eoi_tokenfull_image_sequencesuper__init__)selfr%   r&   r'   r1   r-   r.   kwargsaudio_tokens_expandedimage_tokens_expanded	__class__s             r   r>   zGemma3nProcessor.__init__C   s     !1'6",$0 ")>(?BR(R S S#o)*=#o?T#oV_Vi#o#o#o  0'6",$0 ")>(?BR(R S S#o)*=#o?T#oV_Vi#o#o#o  	
/+'		
 	

 	
 	
 	
 	
 	
r   imagestextaudior@   returnc                     |||t          d            j        t          fd j        j        i|}t          |t                    r|g}n?t          |t                    s*t          |d         t                    st          d          |3  j        |fi |d         }|s fd|D             } fd|D             }ni }| j	        
                    |          }t          |          }  j	        |fi |d         }	|s fd	|D             }t          |          t          |          k    r0t          d
t          |           dt          |           d           fd|D             }ni }	|d                             dd           }
  j        dd|i|d         ddi}                     ||dg           |d         }t          j        |          }d|| j        k    <   d|| j        k    <   d |                                D             }|                                |d<   t+          i ||	||
          S )Nz5Provide at least one of `text`, `images`, or `audio`.tokenizer_init_kwargsr   zAInvalid input text. Please provide a string, or a list of stringsr   c                     g | ]	}j         
S r   )r4   ).0_r?   s     r   
<listcomp>z-Gemma3nProcessor.__call__.<locals>.<listcomp>}   s    888Q(888r   c                 P    g | ]"}|                     j        j                  #S r   )replacer4   r7   rK   promptr?   s     r   rM   z-Gemma3nProcessor.__call__.<locals>.<listcomp>   .    bbbSYFNN4#3T5MNNbbbr   r   c                 f    g | ]-}d                      j        gt          |          z            .S ) )r5   r:   len)rK   rD   r?   s     r   rM   z-Gemma3nProcessor.__call__.<locals>.<listcomp>   s6    ```v$"2!3c&kk!ABB```r   z1Received inconsistently sized batches of images (z) and text (z).c                 P    g | ]"}|                     j        j                  #S r   )rO   r:   r<   rP   s     r   rM   z-Gemma3nProcessor.__call__.<locals>.<listcomp>   rR   r   r   return_tensorsrE   npimage)
modalities	input_ids   r   c                 >    i | ]\  }}||                                 S r   )tolist)rK   kvs      r   
<dictcomp>z-Gemma3nProcessor.__call__.<locals>.<dictcomp>   s&    EEEAq!((**EEEr   token_type_ids)datatensor_typer   )
ValueError_merge_kwargsr   r'   init_kwargs
isinstancestrlistr%   r&   fetch_imagesr   rU   pop_check_special_mm_tokensrX   
zeros_liker8   r2   itemsr^   r   )r?   rD   rE   rF   videosr@   output_kwargsaudio_inputsbatched_imagesimage_inputsrW   text_inputs	array_idsrb   s   `             r   __call__zGemma3nProcessor.__call__c   s    <FNu}TUUU**"
 
"&."<
 
 
 dC   	b6DDD$'' 	b
47C0H0H 	b`aaa141%YY=;XYYL 98888%888 cbbb]abbbDDL)66v>>F7??N/4/aa-P_B`aaL  a````Q_```>""c$ii// vNH[H[vvilmqirirvvv  
 cbbb]abbbDDL&}599:JDQQ$dndd$d-2Ndd_cddd%%dKWI%NNN  ,	y11;<yD$778;<yD$778EE1B1B1D1DEEE(6(=(=(?(?$%!PK!P<!P<!P^lmmmmr   )Nr+   r,   )NNNN)r   r   r   __doc__
attributesfeature_extractor_classimage_processor_classtokenizer_classintr>   r   r   r   r   r   rj   rX   ndarrayfloatr   r   r   rw   __classcell__)rC   s   @r   r$   r$   (   s>        * GFFJ40%O  # #
 
 
 
 
 
 
 
 
D (,^b_c?n ?n$?n I0$y/4HYCZZ[?n bj$u+tBJ7GdSXkIZZ[\	?n /0?n 
?n ?n ?n ?n ?n ?n ?n ?nr   r$   )typingr   r   numpyrX   feature_extraction_utilsr   image_utilsr   r   processing_utilsr	   r
   r   r   r   tokenization_utils_baser   r   r   r   r$   __all__r   r   r   <module>r      s<    # " " " " " " "     4 4 4 4 4 4 A A A A A A A A c c c c c c c c c c c c c c C C C C C C C C# # # # #, # # #    -U    zn zn zn zn zn~ zn zn znz 
r   