
     `i.                        d dl Z d dlmZ d dlmZ d dlmZmZmZ d dl	Z
d dlZd dlmc mZ d dlZd dlmZ d dlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZmZ ddlmZ ddl m!Z!m"Z" ddl#m$Z$m%Z%m&Z& ddl'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0m1Z1 ddl2m3Z3 ddl4m5Z5m6Z6 ddl7m8Z8 ddl9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z?m@Z@ ddlAmBZBmCZCmDZD ddlEmFZF ddlGmHZH ddlImJZJmKZKmLZLmMZMmNZN ddlOmPZPmQZQ ddlRmSZS ddlTmUZU ddlVmWZWmXZXmYZY  e?            rd dlZZZ e@j[        e\          Z] G d deU          Z^ G d d eH          Z_ G d! d"e          Z`e< G d# d$e6                      Zae e<d%&           G d' d(e3                                  Zb G d) d*eP          Zc G d+ d,eQ          Zd G d- d.eY          Ze G d/ d0ejf                  Zg G d1 d2ejf                  Zh G d3 d4eX          Zi G d5 d6eW          Zj G d7 d8eF          Zk G d9 d:ejf                  Zl G d; d<eN          Zm G d= d>eM          Zn G d? d@eK          Zo G dA dBeL          Zp G dC dDejf                  Zq G dE dFejf                  Zr G dG dHejf                  Zs G dI dJejf                  Zt G dK dLeJ          Zu G dM dNejf                  Zv G dO dPejf                  Zw e<dQ&           G dR dSea                      Zx G dT dUeae          Zy G dV dWe          Zzg dXZ{dS )Y    N)Iterable)	dataclass)CallableOptionalUnion)nn)BlipImageProcessor   )ACT2FN)Cache)PretrainedConfig)%ClassifierFreeGuidanceLogitsProcessorGenerationMixinGenerationModeLogitsProcessorList)GenerateDecoderOnlyOutput)BatchFeatureget_size_dict)convert_to_rgbresizeto_channel_dimension_format)
ChannelDimension
ImageInputPILImageResamplingget_image_sizeinfer_channel_dimension_formatis_scaled_imagemake_flat_list_of_imagesto_numpy_arrayvalid_imagesvalidate_preprocess_arguments)ModelOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)
TensorTypeTransformersKwargsauto_docstringcan_return_tuplefilter_out_non_signature_kwargsis_vision_availablelogging   )CONFIG_MAPPING
AutoConfig	AutoModel)Blip2VisionModel)ChameleonVQVAEConfig)ChameleonVQVAEChameleonVQVAEEncoderAttnBlock#ChameleonVQVAEEncoderConvDownsample ChameleonVQVAEEncoderResnetBlockChameleonVQVAEVectorQuantizer)IdeficsBaseModelOutputWithPastIdeficsCausalLMOutputWithPast)eager_attention_forward)SiglipVisionConfig)SiglipEncoderSiglipEncoderLayerSiglipVisionEmbeddingsc                   P     e Zd ZdZdZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d fd	Z xZS )JanusVisionConfiga
  
    This is the configuration class to store the configuration of a [`JanusVisionModel`]. It is used to instantiate a
    `JanusVisionModel` according to the specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    Args:
        hidden_size (`int`, *optional*, defaults to 1024):
            Dimensionality of the encoder layers and the pooler layer.
        num_hidden_layers (`int`, *optional*, defaults to 24):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_channels (`int`, *optional*, defaults to 3):
            The number of input channels.
        patch_size (`int`, *optional*, defaults to 16):
            The size (resolution) of each patch.
        image_size (`int`, *optional*, defaults to 384):
            The size (resolution) of each image.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            Dropout probability for attention weights.
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the layer normalization layers.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"`, and `"gelu_new"` are supported.
        mlp_ratio (`float`, *optional*, defaults to 4.0):
            Ratio of MLP hidden dimensionality to embedding dimensionality.
        attention_bias (`bool`, *optional*, defaults to `True`):
            Whether to add a bias to the queries, keys, and values in the attention layers.
        hidden_dropout_rate (`float`, *optional*, defaults to 0.0):
            The dropout probability for fully connected layers in the encoder.
        projection_dim (`int`, *optional*, defaults to 2048):
            Dimensionality of the MLP projection head.
        projection_dropout (`float`, *optional*, defaults to 0.0):
            Dropout probability for the projection layer.
        use_qk_norm (`bool`, *optional*, defaults to `False`):
            Whether to normalize the query and key matrices.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated normal initializer for initializing all weight matrices.
        depth (`int`, *optional*, defaults to 2):
            Number of hidden layers in the aligner module.
        num_image_tokens (`int`, *optional*, defaults to 576):
            Number of image tokens.
    janus_vision_modelvision_config         r
             ư>gelu      @T   F{Gz?r-   @  c                      t                      j        d|||||||||	d	| | `|
| _        || _        || _        || _        || _        || _        || _	        || _
        || _        d S )N)	hidden_sizenum_hidden_layersnum_attention_headsnum_channels
patch_size
image_sizeattention_dropoutlayer_norm_eps
hidden_act )super__init__intermediate_size	mlp_ratioattention_biashidden_dropout_rateprojection_dimprojection_dropoutuse_qk_norminitializer_rangedepthnum_image_tokens)selfrO   rP   rQ   rR   rS   rT   rU   rV   rW   r\   r]   r^   r_   r`   ra   rb   rc   rd   kwargs	__class__s                       {/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/janus/modular_janus.pyrZ   zJanusVisionConfig.__init__   s    , 	 	
#/ 3%!!/)!	
 	
 	
 	
 	
 "",#6 ,"4&!2
 0    )rC   rD   rE   r
   rE   rF   rG   rH   rI   rJ   TrG   rK   rG   FrL   r-   rM   )__name__
__module____qualname____doc__
model_typebase_config_keyrZ   __classcell__rg   s   @rh   r@   r@   T   s        , ,\ &J%O ',1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1ri   r@   c                        e Zd ZdZddddddddg d	d
dddd
ddfdededededededededee         dedef fdZ xZ	S )JanusVQVAEConfiga:
  
    This is the configuration class to store the configuration of a [`JanusVQVAEModel`]. It is used to instantiate a
    `JanusVQVAEModel` according to the specified arguments, defining the model architecture.
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information. Instantiating a
    configuration with the defaults will yield a similar configuration to the VQModel of the
    [deepseek-community/Janus-Pro-1B](https://huggingface.co/deepseek-community/Janus-Pro-1B).

    Args:
        embed_dim (`int`, *optional*, defaults to 8):
            Dimensionality of each embedding vector.
        num_embeddings (`int`, *optional*, defaults to 16384):
            Number of codebook embeddings.
        double_latent (`bool`, *optional*, defaults to `False`):
            Whether to use double z channels.
        latent_channels (`int`, *optional*, defaults to 256):
            Number of channels for the latent space.
        num_patches (`int`, *optional*, defaults to 32):
            Num of patches the input images can be divided into.
        in_channels (`int`, *optional*, defaults to 3):
            Number of input channels.
        out_channels (`int`, *optional*, defaults to 3):
            Number of out channels.
        base_channels (`int`, *optional*, defaults to 128):
            Base channel count.
        channel_multiplier (`list[int]`, *optional*, defaults to `[1, 1, 2, 2, 4]`):
            Channel multipliers for each resolution.
        num_res_blocks (`int`, *optional*, defaults to 2):
            Number of residual blocks.
        dropout (`float`, *optional*, defaults to 0.0):
            Dropout rate.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        projection_dim (`int`, *optional*, defaults to 2048):
            Dimensionality of the MLP projection head.
        num_hidden_layers (`int`, *optional*, defaults to 2):
            Number of hidden layers in VAVAE MLP Connecter module.
        hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"silu"` and `"gelu_new"` are supported.
        image_token_embed_dim (`int`, *optional*, defaults to 2048):
            Dimension of image embeddings. It should be same as the dimensionality of text embeddings.
       i @  F       r
      )   rx   r-   r-      r-   rG   rL   rK   rI   	embed_dimnum_embeddingsdouble_latentlatent_channelsnum_patchesin_channelsout_channelsbase_channelschannel_multipliernum_res_blocksdropoutc                      t                      j        d|||||||	|
||d
| || _        || _        || _        || _        || _        || _        | `| `	| `
d S )N)
rz   r{   r|   r}   r   r   r   r   r   rb   rX   )rY   rZ   r~   r   r_   rP   rW   image_token_embed_dim
resolutionattn_resolutions	attn_type)re   rz   r{   r|   r}   r~   r   r   r   r   r   r   rb   r_   rP   rW   r   rf   rg   s                     rh   rZ   zJanusVQVAEConfig.__init__   s    ( 	 	
)'+#'1)/	
 	
 	
 	
 	
 '(,!2$%:"O!NNNri   )
rj   rk   rl   rm   intboollistfloatrZ   rp   rq   s   @rh   rs   rs      s        * *\ ##" (7"#* ** * 	*
 * * * * * !I* * * * * * * * * * * *ri   rs   c                   <     e Zd ZdZdZeeedZ	 	 	 	 d fd	Z	 xZ
S )JanusConfiga;  
    This is the configuration class to store the configuration of a [`JanusModel`]. It is used to instantiate an
    Janus model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the Janus-1B or Janus-7B models.

    e.g. [deepseek-community/Janus-Pro-1B](https://huggingface.co/deepseek-community/Janus-Pro-1B) or
    [deepseek-community/Janus-Pro-7B](https://huggingface.co/deepseek-community/Janus-Pro-7B)

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `LlamaConfig`):
            The config object or dictionary of the text backbone.
        vision_config (`Union[AutoConfig, dict]`,  *optional*, defaults to `JanusVisionConfig`):
            The config object or dictionary of the vision backbone.
        vq_config (`Union[AutoConfig, dict]`,  *optional*, defaults to `JanusVQVAEConfig`):
            The config object or dictionary of the VQVAE backbone.
        image_token_id (`int`, *optional*, defaults to 100581):
            Token index of a placeholder image token.

    Example:

    ```python
    >>> from transformers import JanusForConditionalGeneration, JanusConfig, JanusVisionConfig, JanusVQVAEConfig, LlamaConfig

    >>> # Initializing a Janus vision config
    >>> vision_config = JanusVisionConfig()

    >>> # Initializing a Llama config
    >>> text_config = LlamaConfig()

    >>> # Initializing a VQ config
    >>> vq_config = JanusVQVAEConfig()

    >>> # Initializing a Janus Pro 1B style configuration
    >>> configuration = JanusConfig(vision_config=vision_config, text_config=text_config, vq_config=vq_config)

    >>> # Initializing a model from the Janus Pro 1B style configuration
    >>> model = JanusForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```janus)text_configrB   	vq_configN c                 x   t          |t                    r7|                    dd          |d<   t          |d                  d	i || _        nr|4t
                              d           t          d                     | _        n<t          |t                    r|| _        nt          dt          |                     |.t
                              d           t                      | _        nct          |t                    rt          d	i || _        n<t          |t                    r|| _        nt          dt          |                     |.t
                              d           t                      | _        nct          |t                    rt          d	i || _        n<t          |t                    r|| _        nt          dt          |                     | j        j        | _        | j        j        | j        j        z  | j        _        || _         t'                      j        d	i | d S )
Nrn   llamaz7`text_config` is None. Initializing with default valueszTInvalid type for `text_config`. Must be either `dict` or `LlamaConfig`. Type found: zK`vision_config` is None. Initializing with default JanusVisionConfig valuesz\Invalid type for `vision_config`. Must be either `dict` or `JanusVisionConfig`. Type found: zF`vq_config` is None. Initializing with default JanusVQVAEConfig valueszWInvalid type for `vq_config`. Must be either `dict` or `JanusVQVAEConfig`. Type found: rX   )
isinstancedictgetr.   r   loggerinfor   
ValueErrortyper@   rB   rs   r   rb   rT   rS   r~   image_token_idrY   rZ   )re   r   rB   r   r   rf   rg   s         rh   rZ   zJanusConfig.__init__D  s[    k4(( 	(3g(N(NK%-k,.GHWW;WWD KKQRRR-g688D%566 	*D4 $[ 1 14 4  
  KKefff!2!4!4Dt,, 	!2!C!C]!C!CD'899 	!.D6 $] 3 36 6  
 KK`aaa-//DNN	4(( 	-::	::DNN	#344 	&DNN2 $Y2 2  
 "&!3!E%)%7%BdFXFc%c",""6"""""ri   )NNNr   )rj   rk   rl   rm   rn   r/   r@   rs   sub_configsrZ   rp   rq   s   @rh   r   r     sr        + +Z J!*% K 6# 6# 6# 6# 6# 6# 6# 6# 6# 6#ri   r   c                   B    e Zd ZU eed<   dZdZddgZddgZdZ	dZ
dZdZd	S )
JanusPreTrainedModelconfigmodelTLlamaDecoderLayerJanusVisionEncoderLayerpast_key_valuescausal_maskFN)rj   rk   rl   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_can_compile_fullgraph!_supports_param_buffer_assignmentrX   ri   rh   r   r   }  sX         &*#,.GH#4m"DN!(-%%%ri   r   z9
    Base class for Janus VQ-VAE mode model outputs.
    )custom_introc                   \    e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dS )JanusVQVAEOutputz
    decoded_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
        Reconstructed pixel values after encoding and decoding the input.
    embedding_loss (`torch.FloatTensor`):
        Embedding loss.
    Ndecoded_pixel_valuesembedding_loss)
rj   rk   rl   rm   r   r   torchFloatTensorr   r   rX   ri   rh   r   r     sO           9=(5#45<<<26NHU./66666ri   r   c                       e Zd ZdS )JanusBaseModelOutputWithPastNrj   rk   rl   rX   ri   rh   r   r             Dri   r   c                       e Zd ZdS )JanusCausalLMOutputWithPastNr   rX   ri   rh   r   r     r   ri   r   c                   8    e Zd Zddej        dedej        fdZdS )JanusVisionEmbeddingsFpixel_valuesinterpolate_pos_encodingreturnc                 V   |j         \  }}}}| j        j        j        }|                     |                    |                    }|                    d                              dd          }|r|                     |||          }	n|                     | j	                  }	||	z   }|S )Ndtyper-   rx   )
shapepatch_embeddingweightr   toflatten	transposer   position_embeddingposition_ids)
re   r   r   _heightwidthtarget_dtypepatch_embeds
embeddings
pos_embedss
             rh   forwardzJanusVisionEmbeddings.forward  s    *01fe+28++LOO,O,O,OPP!))!,,66q!<<
# 	D66z65QQJJ001BCCJ*,
ri   N)F)rj   rk   rl   r   Tensorr   r   rX   ri   rh   r   r     sH         EL D ]b]i      ri   r   c                   l     e Zd ZdZdef fdZ	 d	dej        deej                 de	e
         fdZ xZS )
JanusVisionAttentionz(Attention Class for Janus Vision Encoderr   c                 6   t                                                       || _        |j        | _        |j        | _        | j        | j        z  | _        | j        | j        z  | j        k    r t          d| j         d| j         d          | j        dz  | _	        |j
        | _
        |j        }|j        }d| _        d| _        t          j        | j        | j        | j        z  |j                  | _        t          j        | j        | j        | j        z  |j                  | _        t          j        | j        | j        | j        z  |j                  | _        t          j        | j        | j                  | _        |dk    rt          j        |          nt          j                    | _        |rt          j        | j                  nt          j                    | _        |rt          j        | j                  nt          j                    | _        d S )	Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      Frx   biasr   )rY   rZ   r   rO   rz   rQ   	num_headshead_dimr   scalerU   r`   ra   	is_causalnum_key_value_groupsr   Linearr]   q_projk_projv_projprojection_layerDropoutIdentity	LayerNormq_normk_norm)re   r   proj_dropoutqk_normrg   s       rh   rZ   zJanusVisionAttention.__init__  s   +3$.8=4>)T^;;'dn ' 'N' ' '   ]D(
!'!90$ %&!i0NU[Ujkkki0NU[Ujkkki0NU[Ujkkk "	$.$. I I>JQ>N>N"*\":":":TVT_TaTa6=Pbl4>2222;==6=Pbl4>2222;==ri   Nhidden_statesattention_maskrf   c                    |                                 \  }}}|                     |          }|                     |          }|                     |          }	|                    d| j        | j                  }|                     |          }|                    d| j        | j                  }|                     |          }|                    ||| j        | j                  	                    dd          }|                    ||| j        | j                  	                    dd          }|	
                    ||| j        | j                  	                    dd          }	t          }
| j        j        dk    rt          | j        j                 }
 |
| |||	|f| j        sdn| j        | j        | j        d|\  }}|                    ||| j                  }|                     |          }|                     |          }||fS )Nrx   r-   eagerrG   )r   scalingr   )sizer   r   r   reshaper   r   r   r   r   viewr:   r   _attn_implementationr#   trainingrU   r   r   rz   r   r`   )re   r   r   rf   
batch_sizeseq_lenr   query_states
key_statesvalue_statesattention_interfaceattn_outputattn_weightsoutputs                 rh   r   zJanusVisionAttention.forward  s    "/!3!3!5!5
GQ{{=11[[//
{{=11#++BNN{{<00''DNDMJJ
[[,,
#++JQUQ^__iijkmnoo''
GT^T][[eefgijkk
#((Wdndm\\ffghjkll(?;+w66"9$+:Z"[$7$7
%
  $}HCC$2HJn
%
 
%
 
%
 
%
!\ "))*gt~NN&&{33((00|##ri   N)rj   rk   rl   rm   r@   rZ   r   r   r   r%   r'   r   rp   rq   s   @rh   r   r     s        22Q0 Q Q Q Q Q Q@ 26)$ )$|)$ !.)$ +,	)$ )$ )$ )$ )$ )$ )$ )$ri   r   c                   H     e Zd Zdef fdZdej        dej        fdZ xZS )JanusVisionMLPr   c                    t                                                       || _        t          |j        |j        z            | _        t          |j                 | _	        t          j        |j        | j                  | _        t          j        | j        |j                  | _        t          j        |j                  | _        t          j        |j                  | _        d S r   )rY   rZ   r   r   rO   r\   r[   r   rW   activation_fnr   r   fc1fc2r   r^   dropout1dropout2re   r   rg   s     rh   rZ   zJanusVisionMLP.__init__  s    !$V%7&:J%J!K!K#F$569V/1GHH9T3V5GHH
6#=>>
6#=>>ri   r   r   c                     |                      |          }|                     |          }|                     |          }|                     |          }|                     |          }|S r   )r  r  r  r  r  re   r   s     rh   r   zJanusVisionMLP.forward  s_    //**=99m44//m44ri   )	rj   rk   rl   r@   rZ   r   r   r   rp   rq   s   @rh   r  r    sk        ?0 ? ? ? ? ? ?U\ el        ri   r  c                   $     e Zd Zdef fdZ xZS )r   r   c                 T   t                                          |           || _        |j        | _        t          |          | _        t          j        | j        |j	                  | _
        t          j        | j        |j	                  | _        t          |          | _        d S )N)eps)rY   rZ   r   rO   rz   r   	self_attnr   r   rV   layer_norm1layer_norm2r  mlpr	  s     rh   rZ   z JanusVisionEncoderLayer.__init__  s       +-f55<F<QRRR<F<QRRR!&))ri   rj   rk   rl   r@   rZ   rp   rq   s   @rh   r   r     sE        *0 * * * * * * * * * *ri   r   c                   $     e Zd Zdef fdZ xZS )JanusVisionEncoderr   c                     t                                                     t          j        fdt	          j                  D                       | _        d S )Nc                 .    g | ]}t                    S rX   )r   .0r   r   s     rh   
<listcomp>z/JanusVisionEncoder.__init__.<locals>.<listcomp>$  s"    $n$n$n%<V%D%D$n$n$nri   )rY   rZ   r   
ModuleListrangerP   layersr	  s    `rh   rZ   zJanusVisionEncoder.__init__"  sR       m$n$n$n$neTZTlNmNm$n$n$noori   r  rq   s   @rh   r  r  !  sP        p0 p p p p p p p p p pri   r  c                   $     e Zd Zdef fdZ xZS )JanusVisionModelr   c                 r    t                                          |           t          |          | _        d S r   )rY   rZ   r  encoderr	  s     rh   rZ   zJanusVisionModel.__init__(  s.       )&11ri   r  rq   s   @rh   r  r  '  sE        20 2 2 2 2 2 2 2 2 2 2ri   r  c                   *     e Zd Zdef fdZd Z xZS )JanusVisionAlignerMLPr   c                 0   t                                                       t          j        j        j                  | _        t          j        fdt          dj	                  D                       | _
        t          j                 | _        d S )Nc                 N    g | ]!}t          j        j        j                  "S rX   r   r   r_   r  s     rh   r  z2JanusVisionAlignerMLP.__init__.<locals>.<listcomp>3  s+    eeeRYv,f.CDDeeeri   rx   )rY   rZ   r   r   rO   r_   r  r  r  rc   hidden_layersr   rW   r  r	  s    `rh   rZ   zJanusVisionAlignerMLP.__init__.  s    9V/1FGG]eeeeeTUW]WcNdNdeee
 
 $F$56ri   c                     |                      |          }| j        D ]"}|                     |          } ||          }#|S r   r  r'  r  re   r   layers      rh   r   zJanusVisionAlignerMLP.forward7  O    //' 	1 	1E ..}==M!E-00MMri   )rj   rk   rl   r@   rZ   r   rp   rq   s   @rh   r#  r#  -  sT        70 7 7 7 7 7 7      ri   r#  c                   H     e Zd Zdef fdZdej        dej        fdZ xZ	S )JanusVQVAEVectorQuantizerr   c                 j    t                                          |           |j        gdz  | _        d S )Nr-   )rY   rZ   r~   quant_state_dimsr	  s     rh   rZ   z"JanusVQVAEVectorQuantizer.__init__@  s4       !'!3 4q 8ri   image_tokensr   c                 :   |j         d         }| j        j        j         d         }|                     |          }t          j        |dd          }|                    |g| j        |R           }|                    dddd                                          }|S )Nr   r   r-   )pdimr
   rx   )	r   	embeddingr   F	normalizer   r0  permute
contiguous)re   r1  r   emb_dimhidden_state_quants        rh   get_codebook_entryz,JanusVQVAEVectorQuantizer.get_codebook_entryD  s    !'*
~,226 "^^L99[);qbIII 044j5b4CX5bZa5b5bcc/771aCCNNPP!!ri   )
rj   rk   rl   rs   rZ   r   
LongTensorr   r<  rp   rq   s   @rh   r.  r.  ?  sm        9/ 9 9 9 9 9 9"u/? "EDU " " " " " " " "ri   r.  c                       e Zd ZdS )JanusVQVAEResnetBlockNr   rX   ri   rh   r?  r?  T  r   ri   r?  c                       e Zd ZdS )JanusVQVAEAttnBlockNr   rX   ri   rh   rA  rA  X  r   ri   rA  c                       e Zd ZdS )JanusVQVAEConvDownsampleNr   rX   ri   rh   rC  rC  \  r   ri   rC  c                   $     e Zd Z fdZd Z xZS )JanusVQVAEConvUpsamplec                     t                                                       t          j                            ||ddd          | _        d S )Nr
   rx   kernel_sizestridepadding)rY   rZ   r   r   Conv2dconv)re   r   rg   s     rh   rZ   zJanusVQVAEConvUpsample.__init__a  s>    HOOK!TU_`Oaa			ri   c                 ^    t          j        |dd          }|                     |          }|S )Ng       @nearest)scale_factormode)r6  interpolaterL  r  s     rh   r   zJanusVQVAEConvUpsample.forwarde  s/    m#IVVV		-00ri   )rj   rk   rl   rZ   r   rp   rq   s   @rh   rE  rE  `  sL        b b b b b      ri   rE  c                   L     e Zd Zdedef fdZdej        dej        fdZ xZ	S )JanusVQVAEMidBlockr   channelsc                     t                                                       t          |||          | _        t	          |          | _        t          |||          | _        d S )Nr   r   r   )rY   rZ   r?  block_1rA  attn_1block_2)re   r   rT  rg   s      rh   rZ   zJanusVQVAEMidBlock.__init__l  sl    , !
 
 

 *(33, !
 
 
ri   r   r   c                     |                      |          }|                     |          }|                     |          }|S r   )rW  rX  rY  r  s     rh   r   zJanusVQVAEMidBlock.forwardz  s;    ]33M22]33ri   )
rj   rk   rl   rs   r   rZ   r   r   r   rp   rq   s   @rh   rS  rS  k  sr        
/ 
3 
 
 
 
 
 
U\ el        ri   rS  c                   4     e Zd Z fdZdej        fdZ xZS )JanusVQVAEEncoderc           	         t                                                       t          |j                  | _        |j        | _        |j        }|j        }|j        }|j	        }|j        }t          j                            ||ddd          | _        dt          |          z   }|| _        t          j                    | _        t%          | j                  D ]
}t          j                    }	t          j                    }
|||         z  }|||         z  }t%          | j                  D ]Y}|	                    t)          |||                     |}|| j        dz
  k    r"|
                    t+          |                     Zt          j                    }|	|_        |
|_        || j        dz
  k    rt3          |          |_        | j                            |           t7          ||          | _        t          j                            d|dd	          | _        t          j                            ||rd
|z  n|ddd          | _        d S )Nr
   rx   rG  )rx   rV  rv   rH   T
num_groupsrR   r  affiner-   ) rY   rZ   lenr   num_resolutionsr   r   r   r|   r}   r   r   rK  conv_intuplein_channel_multiplierr  downr  appendr?  rA  ModuleblockattnrC  
downsamplerS  mid	GroupNormnorm_outconv_out)re   r   r   r   r|   r}   r   re  i_levelri  rj  block_in	block_outi_blockrf  rg   s                  rh   rZ   zJanusVQVAEEncoder.__init__  s@   "6#<==$3,(, 0#6x{MqYZdeff $u-?'@'@ @%:"MOO	T122 	# 	#GMOOE=??D$'<W'EEH%(:7(CCI !455 
? 
?)%$,%.     %d2Q666KK 3H = =>>>9;;DDJDI$.222":8"D"DIT""""%fh77**bxUYbf*gg#0EAo ( 
 
ri   r   c                    |                      |          g}t          | j                  D ]}t          | j                  D ]} | j        |         j        |         |d                   }t          | j        |         j                  dk    r! | j        |         j        |         |          }|                    |           || j        dz
  k    r9|                    | j        |         	                    |d                              |d         }| 
                    |          }|                     |          }|t          j        |          z  }|                     |          }|S )Nr   r   rx   )rc  r  rb  r   rf  ri  ra  rj  rg  rk  rl  rn  r   sigmoidro  )re   r   r   rp  rs  hidden_statelast_hidden_states          rh   r   zJanusVQVAEEncoder.forward  so   l334T122 		W 		WG !455 3 3@ty17@!"%    ty).//!33#C49W#5#:7#CL#Q#QL$$\2222$.222$$TYw%7%B%B=QSCT%U%UVVV *"- HH%677 !MM*;<<U]+<=== MM*;<<  ri   )rj   rk   rl   rZ   r   r=  r   rp   rq   s   @rh   r\  r\    sW        1
 1
 1
 1
 1
f!E$4 ! ! ! ! ! ! ! !ri   r\  c                   B     e Zd Z fdZdej        dej        fdZ xZS )JanusVQVAEDecoderc           	      z   t                                                       t          |j                  | _        |j        | _        |j        }|j        }|j        }||j        | j        dz
           z  }t          j
                            ||ddd          | _        t          ||          | _        t          j                    | _        t#          t%          | j                            D ]}t          j                    }t          j                    }||j        |         z  }	t%          | j        dz             D ]Y}
|                    t)          |||	                     |	}|| j        dz
  k    r"|                    t+          |                     Zt          j                    }||_        ||_        |dk    rt3          |          |_        | j                            |           t          j
                            d|dd	          | _        t          j
                            ||ddd          | _        d S )
Nrx   r
   rG  rV  r   rv   rH   Tr^  )rY   rZ   ra  r   rb  r   r   r}   r   r   r   rK  rc  rS  rl  r  upreversedr  rg  r?  rA  rh  ri  rj  rE  upsamplerm  rn  ro  )re   r   r   r}   r   rq  rp  ri  rj  rr  rs  r{  rg   s               rh   rZ   zJanusVQVAEDecoder.__init__  s   "6#<==$3, 0* !6#<T=QTU=U#VV xaXYcdee &fh77 -//d&: ; ;<< 	 	GMOOE=??D%(A'(JJI !4q!899 
? 
?)%$,%.     %d2Q666KK 3H = =>>>BBHBG!||4X>>GNN2 **bxUYbf*gg,AVWabccri   rv  r   c                 d   |                      |          }|                     |          }t          | j                  D ]}t          | j        dz             D ]g} | j        |         j        |         |          }t          | j        |         j                  dk    r! | j        |         j        |         |          }h|| j        dz
  k    r | j        |         	                    |          }| 
                    |          }|t          j        |          z  }|                     |          }|S )Nrx   r   )rc  rl  r  rb  r   r{  ri  ra  rj  r}  rn  r   ru  ro  )re   rv  rp  rs  s       rh   r   zJanusVQVAEDecoder.forward  s)   ||L11 xx-- T122 	G 	GG !4q!899 P P>tww/5g>|LLtww',--11#A477#3#8#A,#O#OL$.222#ww/88FF}}\22l333}}\22ri   )rj   rk   rl   rZ   r   r   r   rp   rq   s   @rh   ry  ry    sf        ,d ,d ,d ,d ,d\E$5 %:K        ri   ry  c                        e Zd Zg dZdZdef fdZdej        dej	        fdZ
eedej	        deej	        ej	        f         fd                        Z xZS )	
JanusVQVAE)rA  r?  r.  r   r   c                     t                                          |           t          |          | _        d| _        |                                  d S )NF)rY   rZ   ry  decodergradient_checkpointing	post_initr	  s     rh   rZ   zJanusVQVAE.__init__  sJ       (00&+# 	ri   r1  r   c                 r   |j         d         | j        j        d         | j        j        d         z  k    r>t          d| j        j        d         | j        j        d         z   d|j          d          | j                            |          }|                     |          }|                     |          }|S )aG  
        Decodes quantized token IDs into pixel values.
        Args:
            image_tokens (torch.LongTensor): Batch of token IDs.
        Returns:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
                Pixel values decoded from the token IDs.
        rx   r   z4Expected `image_tokens` to have shape `(batch_size, z)`, but got shape `z`.)r   quantizer0  r   r<  post_quant_convr  )re   r1  codebook_entryr   r   s        rh   decodezJanusVQVAE.decode"  s     a DM$B1$EHfghHi$iii9t}GefgGhkokx  lJ  KL  lM  HM 9 9"."49 9 9   99,GG,,^<<||M22ri   c                     |j         d         }|                     |          \  }}}|                     |                    |d                    }t	          ||          S )Nr   r   )r   encoder  r   r   )re   r   r   quantr   indicesr   s          rh   r   zJanusVQVAE.forward5  sZ     "'*
)-\)B)B&~w#{{7<<
B+G+GHH 4nEEEri   )rj   rk   rl   r   main_input_namers   rZ   r   r=  r   r  r)   r(   rd  r   rp   rq   s   @rh   r  r    s          
 %O/      5#3 8I    & F'F 
u %"33	4F F F ^ F F F F Fri   r  c                   *     e Zd Zdef fdZd Z xZS )JanusVQVAEAlignerMLPr   c                 0   t                                                       t          j        j        j                  | _        t          j        fdt          dj	                  D                       | _
        t          j                 | _        d S )Nc                 N    g | ]!}t          j        j        j                  "S rX   r&  r  s     rh   r  z1JanusVQVAEAlignerMLP.__init__.<locals>.<listcomp>H  s+    qqqRYv,f.CDDqqqri   rx   )rY   rZ   r   r   rz   r_   r  r  r  rP   r'  r   rW   r  r	  s    `rh   rZ   zJanusVQVAEAlignerMLP.__init__C  s    9V-v/DEE]qqqqeTUW]WoNpNpqqq
 
 $F$56ri   c                     |                      |          }| j        D ]"}|                     |          } ||          }#|S r   r)  r*  s      rh   r   zJanusVQVAEAlignerMLP.forwardL  r,  ri   )rj   rk   rl   rs   rZ   r   rp   rq   s   @rh   r  r  B  sT        7/ 7 7 7 7 7 7      ri   r  c                   L     e Zd ZdZdef fdZdej        dej        fdZ	 xZ
S )JanusVQVAEHeadzOHead used for sampling tokens in image generation, replacing the usual lm head.r   c                    t                                                       t          j        |j        |j                  | _        t          |j                 | _	        t          j        |j        |j
                  | _        d S r   )rY   rZ   r   r   r   r_   proj_outr   rW   r  r{   vision_headr	  s     rh   rZ   zJanusVQVAEHead.__init__W  sb    	&">@UVV#F$569V%:F<QRRri   r   r   c                     |                      |          }|                     |          }|                     |          }|S r   )r  r  r  r  s     rh   r   zJanusVQVAEHead.forward]  s?    m44**=99((77ri   )rj   rk   rl   rm   rs   rZ   r   r   tensorr   rp   rq   s   @rh   r  r  T  sx        YYS/ S S S S S SU\ el        ri   r  zl
    The Janus model which consists of a siglip vision backbone, a Llama language model and a VQ model.
    c                   ~    e Zd Zdef fdZd Zd Zd Zdej	        dej
        dej
        fd	Zee	 	 	 	 	 	 	 	 	 ddeej	                 deej
                 deej                 deej	                 dee         deej	                 deej
                 dee         deeej        f         fd                        Z xZS )
JanusModelr   c                    t                                          |           || _        t                              |j                  | _        t          | j        j                  | _        t                              |j
                  | _        t          j        | j        j        j        | j        j        j                  | _        t#          | j        j                  | _        t'          | j        j                  | _        t+          j        |j                  | _        d| _        |                                  d S )N)r   F)rY   rZ   r   r  _from_configrB   vision_modelr#  alignerr  r   vqmodelr   	Embeddingr{   rz   generation_embeddingsr  generation_alignerr  generation_headr0   from_configr   language_modelr  r  r	  s     rh   rZ   zJanusModel.__init__j  s       ,99&:NOO,T->-EFF!..v/?@@ &(\$,2E2TVZVbViVs%t%t""6t|7J"K"K-dl.ABB'36;MNNN&+#ri   c                 4    | j                                         S r   )r  get_input_embeddingsre   s    rh   r  zJanusModel.get_input_embeddings  s    "77999ri   c                 :    | j                             |           d S r   )r  set_input_embeddingsre   values     rh   r  zJanusModel.set_input_embeddings  s    0077777ri   c                 d    |                      |          }|                     |j                  }|S r   )r  r  rw  )re   r   image_embedss      rh   get_image_featureszJanusModel.get_image_features  s/    ((66||L$BCCri   	input_idsinputs_embedsimage_featuresc                 \   |e| |                                  t          j        | j        j        t          j        |j                            k    }|                    d          }n|| j        j        k    }|                                }|	                    d          
                    |                              |j                  }||                                         |                                k    r0|j        d         |j        d         z  }t          d| d|           |S )z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        Nr   devicer   r   rx   z6Image features and image tokens do not match: tokens: z, features )r  r   r  r   r   longr  allsum	unsqueeze	expand_asr   numelr   r   )re   r  r  r  special_image_maskn_image_tokensn_image_featuress          rh   get_placeholder_maskzJanusModel.get_placeholder_mask  s/    !.2M$2K2K2M2MT[7uzR_Rfggg3 3 " "4!7!7!;!;!*dk.H!H+//11/99"==GGVVYYZgZnoo+,22448L8L8N8NNN-3A69Ma9PPvvvdtvv   "!ri   Nr   r   r   r   r   cache_position	use_cachelogits_to_keepc
                    |d u |d uz  rt          d          | |                                 |          }||                     |          }|                    d|j        d                   }|                    |j        |j                  }|                     |||          }|	                    ||          } | j
        d|||||||	d|
}t          |j        |j        |j        |j        ||nd           S )NzaYou cannot specify both input_ids and inputs_embeds at the same time, and must specify either oner   )r  r  )r  r   r   r   r  r  r  )rw  r   r   
attentionsimage_hidden_statesrX   )r   r  r  r   r   r   r  r   r  masked_scatterr  r   rw  r   r   r  )re   r  r   r   r   r   r  r  r  r  rf   r  r  image_attention_mask	lm_outputs                  rh   r   zJanusModel.forward  sX    -t";< 	s    7D5577	BBM#22<@@L)11"m6I"6MNNN+..}/C]EXYYN#'#<#<~ $= $ $  *889M~^^M'D' 	
')%+))	
 	
 	
 	
	 ,'9%5#1 +0<0Hd
 
 
 	
ri   )	NNNNNNNNr   )rj   rk   rl   r   rZ   r  r  r  r   r=  r   r  r)   r(   r   r   r   r   r   r   r   rp   rq   s   @rh   r  r  d  s       {      *: : :8 8 8  
")":?:K"]b]n" " " "0  15481537+/5959$(34.
 .
E,-.
 u01.
 !.	.

 u/0.
 "%.
 !!12.
   12.
 D>.
 c5</0.
 .
 .
 ^ .
 .
 .
 .
 .
ri   r  c                   B    e Zd ZddgZdZdef fdZd Zd Zde	j
        d	e	j
        fd
Zee	 	 	 	 	 	 	 	 	 	 ddee	j                 dee	j                 dee	j
                 dee	j                 dee         dee	j                 dee	j                 dee	j                 dee         deee	j
        f         dee         fd                        Z	 	 	 	 	 	 d fd	Zde	j
        fdZe	j        	 	 	 d dee	j
                 dee	j                 dee         f fd            Z xZS )!JanusForConditionalGenerationz(model.language_model.embed_tokens.weightzlm_head.weightTr   c                    t                                          |           || _        t          |          | _        t          j        |j        j        |j        j	        d          | _
        |                                  d S )NFr   )rY   rZ   r   r  r   r   r   r   rO   
vocab_sizelm_headr  r	  s     rh   rZ   z&JanusForConditionalGeneration.__init__  sn       ''
y!3!?ASA^ejkkk 	ri   c                 >    | j         j                                        S r   )r   r  r  r  s    rh   r  z2JanusForConditionalGeneration.get_input_embeddings  s    z(==???ri   c                 D    | j         j                            |           d S r   )r   r  r  r  s     rh   r  z2JanusForConditionalGeneration.set_input_embeddings  s!    
!66u=====ri   inputsr   c                 n    | j                             |          }| j                             |          }|S r   )r   r  r  )re   r  rv  s      rh   'prepare_embeddings_for_image_generationzEJanusForConditionalGeneration.prepare_embeddings_for_image_generation  s2    z77??z44\BBri   Nr   r  r   r   r   r   r  r  labelsr  r  rf   c                 j    | j         d|||||||	|d|}|j        }t          |
t                    rt	          |
 d          n|
}|                     |dd|ddf                   }d}|  | j        d||| j        j        j	        d|}t          |||j        |j        |j        |j                  S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        )r  r   r   r   r   r  r  r  N)logitsr  r  )lossr  r   r   r  r  rX   )r   rw  r   r   slicer  loss_functionr   r   r  r   r   r   r  r  )re   r  r   r   r   r   r  r  r  r  r  rf   outputsr   slice_indicesr  r  s                    rh   r   z%JanusForConditionalGeneration.forward  s   , $* 

%)%+')

 

 

 

  18B>SV8W8Wk~ot444]kmAAA}aaa,?@AA%4% f9P9[ _e D +#3!/) ' ;
 
 
 	
ri   c           	      j     t                      j        |f|||||d|}	|d         dk    r||	d<   |	S )N)r   r  r   r  r  r   r   )rY   prepare_inputs_for_generation)re   r  r   r   r   r  r  r  rf   model_inputsrg   s             rh   r  z;JanusForConditionalGeneration.prepare_inputs_for_generation"  se     =uww<
+')))
 
 
 
 !!!+7L(ri   r1  c                 t    | j         j                            |          }|                    dddd          }|S )a,  
        Decodes generated image tokens from language model to continuous pixel values
        with VQGAN module via upsampling.
        Args:
            image_tokens (`torch.LongTensor` of shape `(batch_size, num_of_tokens)`):
                The tensors corresponding to the input images.
        r   r-   r
   rx   )r   r  r  r8  )re   r1  decoded_images      rh   decode_image_tokensz1JanusForConditionalGeneration.decode_image_tokens@  s;     
*11,??%--aAq99ri   logits_processorc           	      z   |                     d| j                  }t          j        |          }|                     dd          }|dk    r t	                      j        d|||d d|S  |j        di |}|                                t          j	        t          j
        fvrt          d          |                                 |                     |                                           ||nt                      }d|d<   |j        !t                               d           d	|_        |j        |d
<   |                     ||j        |          \  }}	}|j        |j        }}
t-          |j                  dk    rt          d|j         d          |d u}|                     |||j                   |j        r9|j        dk    r.|                    t5          |j                             d |_        |                     ||j        d         |d ||          } | j        d|||j        d|\  }}| j        j        j         j!        }|j        \  }}|"                    dd          }|                     dd           }|"                    dd          }||d<   ||d d d f         |j        k    ||d d d f         |j#        d         k    z  }||d d d f         $                    ||j%                    | &                                |          }| '                    |||          }|(                    dd           <| )                    |j*        pd|dz  tW          |j,        ||z             |          |d<   t[          j.        ||f|
|          }|j/        }|j0        }|j1        }|j2        }|j3        }|r|rdnd }|r|rdnd }|r|rdnd }|r|rdnd }ti          |          D ]x} | j5        d||d|}|d         6                    |j                  |d<   |d         6                    |j                  |d<    | j        j7        di |||d}| 8                    ||          }|j9        d d dd d f         :                                } | j        ;                    |           }! |||!          }"|j<        r@t[          j=        |"d          }#t[          j>        |#d          ?                    d          }$nt[          j@        |"d          }$|$|d d |f<   t[          jA        |$|$g          }$|$B                    d          }$| C                    |$          }z|r:|r||!fz  }|r|| D                                fz  }|r
||jE        z  }|r
||jF        z  }|rt          ||!||||jH                  S |S ) Ngeneration_configgeneration_modetext)r  r   r  guidance_scalezGot incompatible mode for Image Generation, should be one of greedy or sampling. Ensure that beam search is de-activated by setting `num_beams=1`.Tr  zU`guidance_scale` is required for CFG but not provided. Setting to default value of 5.   r  r-   z;Expected input ids of shape (batch_size, seq_len), but got z3Passing `inputs embeds` is not supported currently.)r  rx   )r  input_ids_seq_lengthencoder_input_idsprefix_allowed_tokens_fnr  r  )r  r   expand_sizer   boi_token_idr   static)cache_implementationr   max_cache_lenmodel_kwargsr  rX   )r  r  r  )output_attentionsoutput_hidden_statesr   )r4  )num_samples)	sequencesscoresr  r  r   r   )Ipopr  copydeepcopyrY   generateupdateget_generation_moder   SAMPLEGREEDY_SEARCHr   validate_validate_model_kwargsr   r  r   warning_prepare_model_inputsbos_token_idr   r  ra  r   _prepare_special_tokensrg  r   _get_logits_processor_expand_inputs_for_generationnum_return_sequencesr   r  r   rd   repeatgeneration_kwargsmasked_fill_pad_token_idr  _get_initial_cache_positionr   
_get_cacher  max
max_lengthr   zerosr  r  output_scoresoutput_logitsreturn_dict_in_generater  r  r   r  #_update_model_kwargs_for_generationrw  cloner  	do_samplesoftmaxmultinomialsqueezeargmaxcatr  r  r   r  r   r   r   )&re   r  r   r  rf   r  r  r  r  model_input_namer   r  kwargs_has_attention_maskrd   r   r   input_tokensmaskr  generated_tokensr  r  r  r  r  
raw_scores
raw_logitsdecoder_hidden_statesdecoder_attentionsir  r  rv  r  next_token_scoresprobs
next_tokenrg   s&                                        rh   r  z&JanusForConditionalGeneration.generateL  sd    #JJ':D<RSS M*;<< !**%6??f$$#577# -"3#	 
    0(/99&99 0022>;PR`Rn:oooT   	""$$$##L$5$5$7$7888 0@/K++QdQfQf %)[!+3NNrsss/0,):)I%& 594N4N%2L5
 5
1	#\ ")9vy1$$Fio F F F   %3$$>!$$%68QZcZj$kkk + 	40A0PST0T0T##$IJ[Jj$k$klll/3,  55/!*!3'%)- 6 
 
 #E$"D #
))>#
 #
 	#
 #
	<  :29J'o
G ''1--%))*:DAA'..q!44)7%& Z[[!!!^,0A0NNaaa(,=,OP^,__
 	Z[[!!!^$11$8I8VWWW31133LAA77VV-t44<.2oo%6%K%Wx%>!"3">@PSZ@Z[[) /> / /L*+ !;
4D'EU[abbb .?0E)7)7"3"K3PPRRD
3PPRRD
'> bCW b^b$;\@Q\RRX\'(( #	U #	UA=4= +| GS L .::J-K-N-N}Oc-d-dL)*-9:J-K-N-N}Oc-d-dL)*/dj/  "3%9   G  CCG\ZZL"4QQQAAAX>DDFFL Z//==F 0 0F C C !* E&7R@@@".u!DDDLLRPP

"\*;DDD
%/QQQT" J
#;<<J#--b11J HHTTMM" 	? (vi'
 6|113355
  9"g&88"# ?%)>>%" 
	$,*!-3 ' 7    $#ri   )
NNNNNNNNNr   )NNNNNN)NNN)rj   rk   rl   _tied_weights_keysr   r   rZ   r  r  r   r   r  r)   r(   r   r=  r   r   r   r   r   r%   r'   r   r  r  no_gradr   r  rp   rq   s   @rh   r  r    su       DFVW!{      @ @ @> > >el u|    
  15481537+/5959-1$(341
 1
E,-1
 u011
 !.	1

 u/01
 "%1
 !!121
   121
 )*1
 D>1
 c5</01
 +,1
 1
 1
 ^ 1
l      <
 
 
 
 
 ] *.59:>	|$ |$&|$ !!12|$ ##67	|$ |$ |$ |$ |$ ]|$ |$ |$ |$ |$ri   r  c            #       ,    e Zd ZdZdddej        dddddddfdedeee	e
f                  de
d	ed
edee
ef         dedeeeee         f                  deeeee         f                  dee         dee         f fdZ	 	 	 ddej        dee
ee
e
e
f         f         deee	ef                  deee	ef                  dej        f
dZej        ddfdej        deee	e
f         e
f         d	edeee	ef                  deee	ef                  dej        fdZ e            ddddddddddddej        dfdedee         deee	e
f                  d	ee         d
ee         dee         dee         deeeee         f                  deeeee         f                  deee	ef                  dee         deee
ee
e
e
f         f                  dee         dedeee	ef                  dej        j        f d            Z	 	 	 	 	 	 	 d ded
ee         dee         dee         deee                  deee                  dee	         dee	         fdZ	 d!dej        deeee         f         deeee         f         deee	ef                  dej        f
dZ xZS )"JanusImageProcessora  
    Constructs a JANUS image processor.

    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the
            `do_resize` parameter in the `preprocess` method.
        size (`dict`, *optional*, defaults to `{"height": 384, "width": 384}`):
            Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess`
            method.
        min_size (`int`, *optional*, defaults to 14):
            The minimum allowed size for the resized image. Ensures that neither the height nor width
            falls below this value after resizing.
        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
            Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be
            overridden by the `resample` parameter in the `preprocess` method.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the
            `do_rescale` parameter in the `preprocess` method.
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Scale factor to use if rescaling the image. Only has an effect if `do_rescale` is set to `True`. Can be
            overridden by the `rescale_factor` parameter in the `preprocess` method.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
            method. Can be overridden by the `do_normalize` parameter in the `preprocess` method.
        image_mean (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. Can be
            overridden by the `image_mean` parameter in the `preprocess` method.
        image_std (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
            Can be overridden by the `image_std` parameter in the `preprocess` method.
        do_convert_rgb (`bool`, *optional*, defaults to `True`):
            Whether to convert the image to RGB.
        do_pad (`bool`, *optional*, defaults to `True`):
            Whether to pad the image to square or not.
    TN   gp?	do_resizer   min_sizeresample
do_rescalerescale_factordo_normalize
image_mean	image_stddo_convert_rgbdo_padc                      t                      j        di | || _        || _        |	d| _        d S t          d |D                       | _        d S )N)   r:  r:  c              3   :   K   | ]}t          |d z            V  dS )   N)r   )r  xs     rh   	<genexpr>z/JanusImageProcessor.__init__.<locals>.<genexpr>J  s,      )K)K1#a#g,,)K)K)K)K)K)Kri   rX   )rY   rZ   r8  r0  background_colorrd  )re   r/  r   r0  r1  r2  r3  r4  r5  r6  r7  r8  rf   rg   s                rh   rZ   zJanusImageProcessor.__init__4  si     	""6""" $3D!!!$))K)K
)K)K)K$K$KD!!!ri   r   imager?  data_formatinput_data_formatr   c                 Z   t          ||          \  }}|t          j        k    r|j        d         n|j        d         }||k    r|t	          |||          n|}|S t          ||          }t          |t                    r|g}n&t          |          |k    rt          d| d          |t          j        k    ryt          j        |||f|j                  }	t          |          D ]\  }
}||	|
ddddf<   ||k    r||z
  dz  }||	dd|||z   ddf<   n||z
  dz  }||	dddd|||z   f<   nxt          j        |||f|j                  }	t          |          D ]\  }
}||	dddd|
f<   ||k    r||z
  dz  }||	|||z   ddddf<   n||z
  dz  }||	dd|||z   ddf<   |	S )a}  
        Pads an image to a square based on the longest edge.

        Args:
            image (`np.ndarray`):
                The image to pad.
            background_color (`int` or `tuple[int, int, int]`, *optional*, defaults to 0):
                The color to use for the padding. Can be an integer for single channel or a
                tuple of integers representing for multi-channel images. If passed as integer
                in multi-channel mode, it will default to `0` in subsequent channels.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the output image. Can be one of:
                    - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                    - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                If unset, will use same as the input image.
            input_data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the input image. Can be one of:
                    - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                    - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.

        Returns:
            `np.ndarray`: The padded image.
        r   r   Nz(background_color must have no more than z) elements to match the number of channelsr   r-   )r   r   FIRSTr   r   r  r   r   ra  r   npr  r   	enumerate)re   r@  r?  rA  rB  r   r   rR   max_dimresultr&  colorstarts                rh   pad_to_squarez!JanusImageProcessor.pad_to_squareL  so   < 'u.?@@):>N>T)T)Tu{1~~Z_ZefhZiU?? * ,E;@QRRR 
 Lfe$$ &,, 	 01!""l22r<rrr    0 666X|Wg>ekRRRF%&677 ( (5"'q!!!QQQwv~~ 6)a/7<qqq%%&.0!!!344 5Q.6;qqq!!!UUU]2233Xw>ekRRRF%&677 ( (5"'qqq!!!Qwv~~ 6)a/7<uuv~-qqq!!!344 5Q.6;qqq%%%-/23ri   c                    |t          |          }t          ||          \  }}t          ||          }	t          |d          }|d         |d         k    r!t	          d|d          d|d                    |d         }||	z  }
t          t          ||
z            | j                  t          t          ||
z            | j                  g}t          |f||||d|}|S )	an  
        Resize an image to dynamically calculated size.

        Args:
            image (`np.ndarray`):
                Image to resize.
            size (`dict[str, int]` or `int`):
                The size to resize the image to. If a dictionary, it should have the keys `"height"` and `"width"`.
            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
                `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`.
            data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the output image. If unset, the channel dimension format of the input
                image is used. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `None`: will be inferred from input
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.

        Returns:
            `np.ndarray`: The resized image.
        NTdefault_to_squarer   r   z5Output height and width must be the same. Got height=z and width=)r   r1  rA  rB  )r   r   r  r   r   r   r0  r   )re   r@  r   r1  rA  rB  rf   r   r   max_sizedeltaoutput_size_nonpaddeds               rh   r   zJanusImageProcessor.resize  s    F $ >u E E&u.?@@vu%%TT:::>T']**rXrrcghocprr   H~x FUN##T]33EEM""DM22!

 
&#/
 
 
 
 ri   imagesreturn_tensorsc           
      z   	 ||n j         }n j        ||n j        }n j        ||n j        }n j        		n j        	||n j        }||n j        }n j	        n j
        t          d                               |          }t          |          }t          |          st          d          t!          ||	|           |rd |D             }d |D             }|r/t#          |d                   rt$                              d	           t)          |d                   |r fd
|D             }|r fd|D             }|r fd|D             }|r	 fd|D             }fd|D             }t+          d|i|
          }|S )a`  
        Preprocess an image or batch of images.

        Args:
            images (`ImageInput`):
                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            size (`dict[str, int]`, *optional*, defaults to `self.size`):
                Controls the size of the image after `resize`. The shortest edge of the image is resized to
                `size["shortest_edge"]` whilst preserving the aspect ratio. If the longest edge of this resized image
                is > `int(size["shortest_edge"] * (1333 / 800))`, then the image is resized again to make the longest
                edge equal to `int(size["shortest_edge"] * (1333 / 800))`.
            resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
                Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image values between [0 - 1].
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
                Image mean to normalize the image by if `do_normalize` is set to `True`.
            image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
                Image standard deviation to normalize the image by if `do_normalize` is set to `True`.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
                Whether to convert the image to RGB.
            background_color (`tuple[int, int, int]`):
                The background color to use for the padding.
            do_pad (`bool`, *optional*, defaults to `self.do_pad`):
                Whether to pad the image to square or not.
            return_tensors (`str` or `TensorType`, *optional*):
                The type of tensors to return. Can be one of:
                    - Unset: Return a list of `np.ndarray`.
                    - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
                    - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - Unset: Use the channel dimension format of the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
        NFrM  zkInvalid image type. Must be of type PIL.Image.Image, numpy.ndarray, torch.Tensor, tf.Tensor or jax.ndarray.)r2  r3  r4  r5  r6  r/  r   r1  c                 ,    g | ]}t          |          S rX   )r   r  r@  s     rh   r  z2JanusImageProcessor.preprocess.<locals>.<listcomp>?  s     @@@nU++@@@ri   c                 ,    g | ]}t          |          S rX   )r   rV  s     rh   r  z2JanusImageProcessor.preprocess.<locals>.<listcomp>B  s     <<<E.''<<<ri   r   zIt looks like you are trying to rescale already rescaled images. If the input images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again.c                 B    g | ]}                     |           S ))r@  r   r1  rB  )r   )r  r@  rB  r1  re   r   s     rh   r  z2JanusImageProcessor.preprocess.<locals>.<listcomp>O  s>        %dXYjkk  ri   c                 @    g | ]}                     |           S ))r@  r?  rB  )rK  )r  r@  r?  rB  re   s     rh   r  z2JanusImageProcessor.preprocess.<locals>.<listcomp>V  sI         ""%5&7 #    ri   c                 @    g | ]}                     |           S ))r@  r   rB  )rescale)r  r@  rB  r3  re   s     rh   r  z2JanusImageProcessor.preprocess.<locals>.<listcomp>`  s<        5Rcdd  ri   c                 B    g | ]}                     |           S )r@  meanstdrB  )r7  )r  r@  r5  r6  rB  re   s     rh   r  z2JanusImageProcessor.preprocess.<locals>.<listcomp>f  s>        U^opp  ri   c                 4    g | ]}t          |           S )input_channel_dim)r   )r  r@  rA  rB  s     rh   r  z2JanusImageProcessor.preprocess.<locals>.<listcomp>k  s7     
 
 
ej'{N_```
 
 
ri   r   datatensor_type)r/  r1  r2  r3  r4  r5  r6  r7  r8  r?  r   r   fetch_imagesr   r    r   r!   r   r   warning_oncer   r   )re   rR  r/  r   r1  r2  r3  r4  r5  r6  rS  r7  r?  r8  rA  rB  encoded_outputss   `  `` ` ``  ` `` rh   
preprocesszJanusImageProcessor.preprocess  s   L "+!6IIDN	'388#-#9ZZt
+9+E4K^'3'?||TEV#-#9ZZt
!*!6IIDN	+9+E4K^!-4;/?/K++QUQf'ttTYTU;;;""6**)&11F## 	:  
 	&!)%!		
 		
 		
 		
  	A@@@@@F =<V<<< 	/&)44 	s  
 $ >vay I I 	      #  F
  		      $  F  	     #  F
  	      #  F

 
 
 
 
nt
 
 
 '^V,DR`aaari   c	                 4   ||n| j         }|
d| j        z  n|}||n| j        }||n| j        }||n| j        }t          |          }t          |d         t          j        j                  rt          |          dk    r|n|d         S |t          |d                   }g }	|D ]}
t          |
          }
|r|                     |
|||          }
|rK|                     |
||          }
|
                    dd                              t           j                  }
|rC|rA|dk    r;t%          |
t&          j        |	          }
t          j                            |
          }
|	                    |
           d
|	i}|dk    r|nd}t/          ||          S )znApplies post-processing to the decoded image tokens by reversing transformations applied during preprocessing.Ng      ?r   rx   )r@  r5  r6  rB  )r   rB  r<  zPIL.Image.Imagera  r   rc  )r2  r3  r4  r5  r6  r   r   PILImagera  r   r   unnormalizer[  clipastyperE  uint8r   r   LAST	fromarrayrg  r   )re   rR  r2  r3  r4  r5  r6  rB  rS  r   r@  rd  s               rh   postprocesszJanusImageProcessor.postprocesss  s    $.#9ZZt
6D6Lt222R`'3'?||TEV#-#9ZZt
!*!6IIDN	)&11fQi11 	< [[1__66&);$ >vay I I 	' 	'E"5))E ((J)_p )    <U.Teff

1c**11"(;; 3
 3~AR/R/R3E;K;Pduvvv	++E22&&&&-+9=N+N+NTX>BBBBri   c                     d}t          |t                    r6t          |          |k    r"t          d| dt          |                     n|g|z  }t          |t                    r6t          |          |k    r"t          d| dt          |                     n|g|z  }t	          d t          ||          D                       }t	          d |D                       }|                     ||||          }|S )a~  
        Unnormalizes `image` using the mean and standard deviation specified by `mean` and `std`.
        image = (image * image_std) + image_mean
        Args:
            image (`torch.Tensor` of shape `(batch_size, num_channels, image_size, image_size)` or `(num_channels, image_size, image_size)`):
                Batch of pixel values to postprocess.
            image_mean (`float` or `Iterable[float]`):
                The mean to use for unnormalization.
            image_std (`float` or `Iterable[float]`):
                The standard deviation to use for unnormalization.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
        r
   zmean must have z$ elements if it is an iterable, got zstd must have c              3   (   K   | ]\  }}| |z  V  d S r   rX   )r  r^  r_  s      rh   r>  z2JanusImageProcessor.unnormalize.<locals>.<genexpr>  s,      WWytSus{WWWWWWri   c              3       K   | ]	}d |z  V  
dS )rx   NrX   )r  r_  s     rh   r>  z2JanusImageProcessor.unnormalize.<locals>.<genexpr>  s&      ;;#a#g;;;;;;ri   r]  )r   r   ra  r   rd  zipr7  )re   r@  r5  r6  rB  rR   rev_image_meanrev_image_stds           rh   rm  zJanusImageProcessor.unnormalize  s2   0 j(++ 	5:,.. !v<!v!vehisetet!v!vwww / %4Ji** 	39~~-- !t,!t!tdghqdrdr!t!tuuu . #l2IWWC
I<V<VWWWWW;;;;;;;n-Sd  
 
 ri   )r   NN)NNNNNNNr   ) rj   rk   rl   rm   r   BICUBICr   r   r   strr   r   r   r   rZ   rE  ndarrayrd  r   rK  r   r*   rD  r   r&   rk  rl  ri  rs  r   rm  rp   rq   s   @rh   r-  r-    s6       % %R )-'9'A,3!:>9=)-!%L LL tCH~&L 	L
 %L L c5j)L L U5$u+#567L E%e"456L !L L L L L L L6 >?>BDHH HzH  U3S=%9 9:H eC)9$9:;	H
 $E#/?*?$@AH 
H H H H\ (:'A>BDH? ?z? DcNC'(? %	?
 eC)9$9:;? $E#/?*?$@A? 
? ? ? ?B %$&& %))-15%)*.'+:>9=;?)-GK!%(8(>DH!Y YY D>Y tCH~&	Y
 -.Y TNY !Y tnY U5$u+#567Y E%e"456Y !sJ!78Y !Y #5eCcM.B)B#CDY Y &Y  $E#/?*?$@A!Y" 
#Y Y Y '&Y| &**.'+,0+/+/(,1C 1C1C TN1C !	1C
 tn1C T%[)1C DK(1C $C=1C !1C 1C 1C 1Cp EI+ +z+ %%01+ /0	+
 $E#/?*?$@A+ 
+ + + + + + + +ri   r-  )	r-  r   r  r  r  r  rs   r@   r   )|r  collections.abcr   dataclassesr   typingr   r   r   numpyrE  r   torch.nn.functionalr   
functionalr6  torch.utils.checkpoint.transformers.models.blip.image_processing_blipr	   activationsr   cache_utilsr   configuration_utilsr   
generationr   r   r   r   generation.utilsr   image_processing_utilsr   r   image_transformsr   r   r   image_utilsr   r   r   r   r   r   r   r   r    r!   modeling_outputsr"   modeling_utilsr#   r$   processing_utilsr%   utilsr&   r'   r(   r)   r*   r+   r,   autor.   r/   r0   blip_2.modeling_blip_2r1   !chameleon.configuration_chameleonr2   chameleon.modeling_chameleonr3   r4   r5   r6   r7   idefics.modeling_ideficsr8   r9   llama.modeling_llamar:   siglip.configuration_siglipr;   siglip.modeling_siglipr<   r=   r>   rk  
get_loggerrj   r   r@   rs   r   r   r   r   r   r   rh  r   r  r   r  r  r#  r.  r?  rA  rC  rE  rS  r\  ry  r  r  r  r  r  r-  __all__rX   ri   rh   <module>r     s	     $ $ $ $ $ $ ! ! ! ! ! ! , , , , , , , , , ,                         M M M M M M ! ! ! ! ! !             3 3 3 3 3 3 u u u u u u u u u u u u 9 9 9 9 9 9 A A A A A A A A S S S S S S S S S S                        , + + + + + F F F F F F F F & & & & & &                  9 8 8 8 8 8 8 8 8 8 5 5 5 5 5 5 D D D D D D              e d d d d d d d : : : : : : < < < < < < ^ ^ ^ ^ ^ ^ ^ ^ ^ ^  JJJ		H	%	%
^1 ^1 ^1 ^1 ^1* ^1 ^1 ^1BW W W W W+ W W Wtk# k# k# k# k#" k# k# k#\ 
. 
. 
. 
. 
.? 
. 
. 
.   
	7 	7 	7 	7 	7{ 	7 	7  	7	 	 	 	 	#A 	 	 		 	 	 	 	"? 	 	 	    2   "I$ I$ I$ I$ I$29 I$ I$ I$X    RY   (* * * * *0 * * *p p p p p p p p2 2 2 2 2' 2 2 2    BI   $" " " " " = " " "*	 	 	 	 	< 	 	 		 	 	 	 	8 	 	 		 	 	 	 	B 	 	 	    RY          ,J! J! J! J! J!	 J! J! J!ZA A A A A	 A A AH-F -F -F -F -F -F -F -F`    29   $    RY       
i
 i
 i
 i
 i
% i
 i
 
i
Xt$ t$ t$ t$ t$$8/ t$ t$ t$n	E E E E E, E E EP
 
 
ri   