
     `i                     ~   d dl Z d dlmZ d dlmZmZmZ d dlZd dlm	c m
Z d dlm	Z	 ddlmZ ddlmZ ddlmZmZmZmZ dd	lmZ dd
lmZ ddlmZmZmZ ddlmZmZ ddl m!Z! ddl"m#Z#m$Z$m%Z%m&Z&m'Z' ddl(m)Z) ddl*m+Z+ ddl,m-Z-m.Z.m/Z/  e&j0        e1          Z2e$ G d de                      Z3e e$d           G d de                                  Z4e e$d           G d de                                  Z5e e$d           G d de                                  Z6 G d  d!e	j7                  Z8d"ej9        d#e:d$ej9        fd%Z;	 d_d'e	j7        d(ej9        d)ej9        d*ej9        d+eej9                 d,e<d-e<d.e!e#         fd/Z= G d0 d1e	j7                  Z> G d2 d3e	j7                  Z? G d4 d5e          Z@ G d6 d7e	j7                  ZA G d8 d9e	j7                  ZB G d: d;e	j7                  ZC G d< d=e          ZDe$ G d> d?e3                      ZE G d@ dAe	j7                  ZF G dB dCe	j7                  ZG G dD dEe	j7                  ZH G dF dGe	j7                  ZI G dH dIe	j7                  ZJ G dJ dKe	j7                  ZK G dL dMe	j7                  ZL G dN dOe	j7                  ZM G dP dQe	j7                  ZN e$dR           G dS dTe3                      ZO G dU dVe	j7                  ZP G dW dXe	j7                  ZQ e$dY           G dZ d[e3                      ZR G d\ d]e3e          ZSg d^ZTdS )`    N)	dataclass)CallableOptionalUnion)nn   )ACT2FN)Cache)%ClassifierFreeGuidanceLogitsProcessorGenerationMixinGenerationModeLogitsProcessorList)GenerateDecoderOnlyOutput)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPoolingModelOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging	torch_int)check_model_inputs   )	AutoModel   )JanusConfigJanusVisionConfigJanusVQVAEConfigc                   B    e Zd ZU eed<   dZdZddgZddgZdZ	dZ
dZdZd	S )
JanusPreTrainedModelconfigmodelTLlamaDecoderLayerJanusVisionEncoderLayerpast_key_valuescausal_maskFN)__name__
__module____qualname__r    __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_can_compile_fullgraph!_supports_param_buffer_assignment     |/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/janus/modeling_janus.pyr$   r$   /   sX         &*#,.GH#4m"DN!(-%%%r8   r$   z9
    Base class for Janus VQ-VAE mode model outputs.
    )custom_introc                   \    e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dS )JanusVQVAEOutputz
    decoded_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
        Reconstructed pixel values after encoding and decoding the input.
    embedding_loss (`torch.FloatTensor`):
        Embedding loss.
    Ndecoded_pixel_valuesembedding_loss)
r+   r,   r-   __doc__r=   r   torchFloatTensorr.   r>   r7   r8   r9   r<   r<   =   sO           9=(5#45<<<26NHU./66666r8   r<   zy
    Base class for Janus model's outputs that may also contain a past key/values (to speed up sequential decoding).
    c                       e Zd ZU dZdZeej                 ed<   dZ	ee
         ed<   dZeeej                          ed<   dZeeej                          ed<   dZeeej                          ed<   dS )JanusBaseModelOutputWithPasta  
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.

        If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
        hidden_size)` is output.
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
        `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
        input) to speed up sequential decoding.
    image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
        Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
        sequence_length, hidden_size)`.

        image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
    Nlast_hidden_stater)   hidden_states
attentionsimage_hidden_states)r+   r,   r-   r?   rD   r   r@   rA   r.   r)   r
   rE   tuplerF   rG   r7   r8   r9   rC   rC   O   s          & 6:x 12999'+OXe_+++8<M8E%"345<<<59Ju012999>B%(9":;BBBBBr8   rC   zQ
    Base class for Janus causal language model (or autoregressive) outputs.
    c                      e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
ee         ed<   dZeeej                          ed<   dZeeej                          ed<   dZeeej                          ed<   dS )	JanusCausalLMOutputWithPastae  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
        Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
        sequence_length, hidden_size)`.

        image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
    Nlosslogitsr)   rE   rF   rG   )r+   r,   r-   r?   rK   r   r@   rA   r.   rL   r)   r
   rE   rH   rF   rG   r7   r8   r9   rJ   rJ   p   s          " )-D(5$
%,,,*.FHU&'...'+OXe_+++8<M8E%"345<<<59Ju012999>B%(9":;BBBBBr8   rJ   c                   z     e Zd Zdef fdZdej        dededej        fdZdd	ej        d
e	dej        fdZ
 xZS )JanusVisionEmbeddingsr%   c                    t                                                       || _        |j        | _        |j        | _        |j        | _        t          j        |j	        | j        | j        | j        d          | _
        | j        | j        z  dz  | _        | j        | _        t          j        | j        | j                  | _        |                     dt!          j        | j                                      d          d           d S )Nvalid)in_channelsout_channelskernel_sizestridepaddingr   position_ids)r   F)
persistent)super__init__r%   hidden_size	embed_dim
image_size
patch_sizer   Conv2dnum_channelspatch_embeddingnum_patchesnum_positions	Embeddingposition_embeddingregister_bufferr@   arangeexpandselfr%   	__class__s     r9   rZ   zJanusVisionEmbeddings.__init__   s    + + +!y+? 
  
  
 !Ot>1D!-"$,t/A4>"R"R^U\$:L-M-M-T-TU\-]-]jopppppr8   
embeddingsheightwidthreturnc                 ~   |j         d         }| j        j        j         d         }t          j                                        s&||k    r ||k    r|                     | j                  S | j        j                            d          }|j         d         }|| j        z  }|| j        z  }	t          |dz            }
|
                    d|
|
|          }|                    dddd          }t          j                            |||	fdd	          }|                    dddd                              dd|          }|S )
a  
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing and no class embeddings.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   r   rW   g      ?r   r   bicubicF)sizemodealign_corners)shapere   weightr@   jit
is_tracingrV   	unsqueezer^   r   reshapepermuter   
functionalinterpolateview)rj   rl   rm   rn   rb   rc   patch_pos_embeddim
new_height	new_widthsqrt_num_positionss              r9   interpolate_pos_encodingz.JanusVisionEmbeddings.interpolate_pos_encoding   sL    !&q)/6<Q? y##%% 	>+*F*F6UZ??**4+<===18BB1EEr"t.
T_,	&}c'9::)11!5GI[]`aa)11!Q1==-33i(	 4 
 
 *11!Q1==BB1b#NNr8   Fpixel_valuesr   c                 V   |j         \  }}}}| j        j        j        }|                     |                    |                    }|                    d                              dd          }|r|                     |||          }	n|                     | j	                  }	||	z   }|S )N)dtyper   r   )
ru   ra   rv   r   toflatten	transposer   re   rV   )
rj   r   r   _rm   rn   target_dtypepatch_embedsrl   
pos_embedss
             r9   forwardzJanusVisionEmbeddings.forward   s    *01fe+28++LOO,O,O,OPP!))!,,66q!<<
# 	D66z65QQJJ001BCCJ*,
r8   )F)r+   r,   r-   r!   rZ   r@   Tensorintr   boolr   __classcell__rk   s   @r9   rN   rN      s        q0 q q q q q q($5< $ $UX $]b]i $ $ $ $L EL D ]b]i        r8   rN   rE   n_repro   c                     | j         \  }}}}|dk    r| S | dddddddddf                             |||||          } |                     |||z  ||          S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)ru   rh   rz   )rE   r   batchnum_key_value_headsslenhead_dims         r9   	repeat_kvr      s    
 2?1D.Ehzz!!!!QQQaaa"23::5BUW\^bdlmmM  (;e(CT8TTTr8           modulequerykeyvalueattention_maskscalingdropoutkwargsc                 R   t          || j                  }t          || j                  }	t          j        ||                    dd                    |z  }
|$|d d d d d d d |j        d         f         }|
|z   }
t          j                            |
dt          j	                  
                    |j                  }
t          j                            |
|| j                  }
t          j        |
|	          }|                    dd                                          }||
fS )Nr   r   rW   )r   r   )ptrainingr   )r   num_key_value_groupsr@   matmulr   ru   r   r|   softmaxfloat32r   r   r   r   
contiguous)r   r   r   r   r   r   r   r   
key_statesvalue_statesattn_weightsr*   attn_outputs                r9   eager_attention_forwardr      s    3 ;<<JUF$?@@L<z';';Aq'A'ABBWLL!$QQQ111.D
0@0D.D%DE#k1=((2U](SSVVW\WbccL=((6?([[L,|\::K''1--88::K$$r8   c                   l     e Zd ZdZdef fdZ	 d	dej        deej                 de	e
         fdZ xZS )
JanusVisionAttentionz(Attention Class for Janus Vision Encoderr%   c                 6   t                                                       || _        |j        | _        |j        | _        | j        | j        z  | _        | j        | j        z  | j        k    r t          d| j         d| j         d          | j        dz  | _	        |j
        | _
        |j        }|j        }d| _        d| _        t          j        | j        | j        | j        z  |j                  | _        t          j        | j        | j        | j        z  |j                  | _        t          j        | j        | j        | j        z  |j                  | _        t          j        | j        | j                  | _        |dk    rt          j        |          nt          j                    | _        |rt          j        | j                  nt          j                    | _        |rt          j        | j                  nt          j                    | _        d S )	N;embed_dim must be divisible by num_heads (got `embed_dim`:  and `num_heads`: ).      Fr   biasr   )rY   rZ   r%   r[   r\   num_attention_heads	num_headsr   
ValueErrorscaleattention_dropoutprojection_dropoutuse_qk_norm	is_causalr   r   Linearattention_biasq_projk_projv_projprojection_layerDropoutIdentity	LayerNormq_normk_norm)rj   r%   proj_dropoutqk_normrk   s       r9   rZ   zJanusVisionAttention.__init__  s   +3$.8=4>)T^;;'dn ' 'N' ' '   ]D(
!'!90$ %&!i0NU[Ujkkki0NU[Ujkkki0NU[Ujkkk "	$.$. I I>JQ>N>N"*\":":":TVT_TaTa6=Pbl4>2222;==6=Pbl4>2222;==r8   NrE   r   r   c                    |                                 \  }}}|                     |          }|                     |          }|                     |          }	|                    d| j        | j                  }|                     |          }|                    d| j        | j                  }|                     |          }|                    ||| j        | j                  	                    dd          }|                    ||| j        | j                  	                    dd          }|	
                    ||| j        | j                  	                    dd          }	t          }
| j        j        dk    rt          | j        j                 }
 |
| |||	|f| j        sdn| j        | j        | j        d|\  }}|                    ||| j                  }|                     |          }|                     |          }||fS )NrW   r   r   eagerr   )r   r   r   )rr   r   r   r   rz   r   r   r   r   r   r~   r   r%   _attn_implementationr   r   r   r   r   r\   r   r   )rj   rE   r   r   
batch_sizeseq_lenr   query_statesr   r   attention_interfacer   r   outputs                 r9   r   zJanusVisionAttention.forward!  s    "/!3!3!5!5
GQ{{=11[[//
{{=11#++BNN{{<00''DNDMJJ
[[,,
#++JQUQ^__iijkmnoo''
GT^T][[eefgijkk
#((Wdndm\\ffghjkll(?;+w66"9$+:Z"[$7$7
%
  $}HCC$2HJn
%
 
%
 
%
 
%
!\ "))*gt~NN&&{33((00|##r8   N)r+   r,   r-   r?   r!   rZ   r@   r   r   r   r   r   r   r   s   @r9   r   r     s        22Q0 Q Q Q Q Q Q@ 26)$ )$|)$ !.)$ +,	)$ )$ )$ )$ )$ )$ )$ )$r8   r   c                   H     e Zd Zdef fdZdej        dej        fdZ xZS )JanusVisionMLPr%   c                    t                                                       || _        t          |j        |j        z            | _        t          |j                 | _	        t          j        |j        | j                  | _        t          j        | j        |j                  | _        t          j        |j                  | _        t          j        |j                  | _        d S r   )rY   rZ   r%   r   r[   	mlp_ratiointermediate_sizer	   
hidden_actactivation_fnr   r   fc1fc2r   hidden_dropout_ratedropout1dropout2ri   s     r9   rZ   zJanusVisionMLP.__init__N  s    !$V%7&:J%J!K!K#F$569V/1GHH9T3V5GHH
6#=>>
6#=>>r8   rE   ro   c                     |                      |          }|                     |          }|                     |          }|                     |          }|                     |          }|S r   )r   r   r   r   r   rj   rE   s     r9   r   zJanusVisionMLP.forwardX  s_    //**=99m44//m44r8   )	r+   r,   r-   r!   rZ   r@   r   r   r   r   s   @r9   r   r   M  sk        ?0 ? ? ? ? ? ?U\ el        r8   r   c            	       v     e Zd Zdef fdZedej        dej        dee	         dej
        fd            Z xZS )r(   r%   c                 R   t                                                       |j        | _        t	          j        | j        |j                  | _        t          |          | _	        t	          j        | j        |j                  | _
        t          |          | _        || _        d S N)eps)rY   rZ   r[   r\   r   r   layer_norm_epslayer_norm1r   	self_attnlayer_norm2r   mlpr%   ri   s     r9   rZ   z JanusVisionEncoderLayer.__init__b  s    +<F<QRRR-f55<F<QRRR!&))r8   rE   r   r   ro   c                     |}|                      |          } | j        d||d|\  }}||z   }|}|                     |          }|                     |          }||z   }|S )N)rE   r   r7   r   r   r   r   rj   rE   r   r   residualr   s         r9   r   zJanusVisionEncoderLayer.forwardk  s     !((77)4> 
')
 
 
 
q
 !=0 ((77// =0r8   )r+   r,   r-   r!   rZ   r   r@   r   r   r   rA   r   r   r   s   @r9   r(   r(   a  s        0       |  +,	
 
	   ^    r8   r(   c                   r     e Zd ZdZdef fdZe	 d	deej	                 de
e         defd            Z xZS )
JanusVisionEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`JanusVisionEncoderLayer`].

    Args:
        config: JanusVisionConfig
    r%   c                     t                                                       | _        t          j        fdt          j                  D                       | _        d| _        d S )Nc                 .    g | ]}t                    S r7   )r(   .0r   r%   s     r9   
<listcomp>z/JanusVisionEncoder.__init__.<locals>.<listcomp>  s"    $n$n$n%<V%D%D$n$n$nr8   F)	rY   rZ   r%   r   
ModuleListrangenum_hidden_layerslayersgradient_checkpointingri   s    `r9   rZ   zJanusVisionEncoder.__init__  sa    m$n$n$n$neTZTlNmNm$n$n$noo&+###r8   Nr   r   ro   c                 N    |}| j         D ]} |||fi |}t          |          S )N)rD   )r   r   )rj   inputs_embedsr   r   rE   encoder_layers         r9   r   zJanusVisionEncoder.forward  sU     &![ 	 	M)M   MM ????r8   r   )r+   r,   r-   r?   r!   rZ   r   r   r@   r   r   r   r   r   r   r   s   @r9   r   r     s         ,0 , , , , , ,  26@ @ !.@ +,	@
 
@ @ @ ^@ @ @ @ @r8   r   c                        e Zd ZdZ fdZdej        dedefdZ	 ddej        d	e	ej                 d
e
ej        e	ej                 e	e
ej                          f         fdZ xZS )JanusAttentionz=Multi-headed attention from 'Attention Is All You Need' paperc                 V   t                                                       || _        |j        | _        |j        | _        | j        | j        z  | _        | j        | j        z  | j        k    r t          d| j         d| j         d          | j        dz  | _	        d| _
        |j        | _        t          j        | j        d| j        z  d          | _        |j        rWt          j        t#          j        | j                            }t          j        t#          j        | j                            }nd }d }|It#          j        |t#          j        |d          |f          }t          j        |          | j        _        t          j        | j        | j                  | _        d S )	Nr   r   r   r   Fr   r   )requires_grad)rY   rZ   r%   r[   r\   r   r   r   r   r   r   r   r   r   qkvqkv_bias	Parameterr@   zeroscat
zeros_liker   
projection)rj   r%   q_biasv_biasr	  rk   s        r9   rZ   zJanusAttention.__init__  s   +3$.8=4>)T^;;'dn ' 'N' ' '   ]D(
!'!9 9T^Q-?eLLL? 	\%+dn"="=>>F\%+dn"="=>>FFFFy&%*:6QV*W*W*WY_!`aaHL22DHM)DNDNCCr8   tensorr   bszc                     |                     ||| j        | j                                      dd                                          S )Nr   r   )r~   r   r   r   r   )rj   r  r   r  s       r9   _shapezJanusAttention._shape  s<    {{3GGQQRSUVWWbbdddr8   NrE   	head_maskro   c                 2   |                                 \  }}}|                     |          }|                    ||d| j        || j        z                                ddddd          }|d         |d         |d         }
}	}t
          }| j        j        dk    rt          | j        j                 } || ||	|
fd| j	        sdn| j
        | j        d	|\  }}|                    ||d
                                          }|                     |          }||fS )z#Input shape: Batch x Time x Channelr   r   r   r      r   Nr   )r   r   r   rW   )rr   r  rz   r   r{   r   r%   r   r   r   r   r   r   r  )rj   rE   r  r   r  tgt_lenr\   	mixed_qkvr   r   r   r   r   r   s                 r9   r   zJanusAttention.forward  sC    #0"4"4"6"6WiHH]++	%%c7At~yTXTbGbcckkq!Q
 
	 2;1y|YWX\,j(?;+w66"9$+:Z"[$7$7		%

  #}HCC$2HJ	%
 	%
 	%
 	%
!\ "))#w;;FFHHook22L((r8   r   )r+   r,   r-   r?   rZ   r@   r   r   r  r   rH   r   r   r   s   @r9   r  r    s        GGD D D D D>eU\ eC ec e e e e -1$) $)|$) EL)$)
 
u|Xel3XeEL>Q5RR	S$) $) $) $) $) $) $) $)r8   r  c                   B     e Zd Z fdZdej        dej        fdZ xZS )JanusMLPc                    t                                                       || _        t          |j                 | _        t          j        |j        |j	                  | _
        t          j        |j	        |j                  | _        d S r   )rY   rZ   r%   r	   r   r   r   r   r[   r   r   r   ri   s     r9   rZ   zJanusMLP.__init__  sf    #F$569V/1IJJ9V5v7IJJr8   rE   ro   c                     |                      |          }|                     |          }|                     |          }|S r   )r   r   r   r   s     r9   r   zJanusMLP.forward  s=    //**=99//r8   )r+   r,   r-   rZ   r@   r   r   r   r   s   @r9   r  r    sc        K K K K KU\ el        r8   r  c            	       v     e Zd Zdef fdZedej        dej        dee	         dej
        fd            Z xZS )JanusEncoderLayerr%   c                 D   t                                                       |j        | _        t	          |          | _        t          j        | j        |j                  | _	        t          |          | _        t          j        | j        |j                  | _        d S r   )rY   rZ   r[   r\   r  r   r   r   r   r   r  r   r   ri   s     r9   rZ   zJanusEncoderLayer.__init__  s    +'//<F<QRRRF##<F<QRRRr8   rE   r   r   ro   c                     |}|                      |          } | j        d||d|\  }}||z   }|}|                     |          }|                     |          }||z   }|S )N)rE   r  r7   r   r   s         r9   r   zJanusEncoderLayer.forward
  s     !((77)4> 
'$
 
 
 
q
 &0 ((77//%0r8   )r+   r,   r-   r    rZ   r   r@   r   r   r   rA   r   r   r   s   @r9   r  r    s        S{ S S S S S S |  +,	
 
	   ^    r8   r  c                        e Zd ZU dZeed<   eedZdef fdZ	 e
d          e	 	 ddeej                 ded	ee         d
eeef         fd                        Zd Z xZS )JanusVisionModelr   r%   )rE   rF   c                    t                                          |           || _        |j        }t	          |          | _        t          |          | _        t          j	        ||j
                  | _        |                                  d S r   )rY   rZ   r%   r[   rN   rl   r   encoderr   r   r   post_layernorm	post_init)rj   r%   r\   rk   s      r9   rZ   zJanusVisionModel.__init__,  sx       &	/77)&11 l9&:OPPPr8   F)tie_last_hidden_statesNr   r   ro   c                    |t          d          |                     ||          } | j        dd|i|}|j        }|                     |          }|d d dd d f         }|                     |          }t          ||          S )Nz You have to specify pixel_values)r   r  r   )rD   pooler_outputr7   )r   rl   r%  rD   r&  r   )rj   r   r   r   rE   encoder_outputsrD   pooled_outputs           r9   r   zJanusVisionModel.forward7  s     ?@@@Oghh+74< ,
 ,
',
,
 ,

 ,= //0ABB)!!!Q'2++M::)/'
 
 
 	
r8   c                     | j         S r   )rl   rj   s    r9   get_input_embeddingsz%JanusVisionModel.get_input_embeddingsT  s
    r8   NF)r+   r,   r-   main_input_namer!   r.   r  r  _can_record_outputsrZ   r   r   r   r@   rA   r   r   r   r   rH   r   r   r/  r   r   s   @r9   r#  r#  #  s         $O*$ 
	0 	 	 	 	 	 	 u555 59).
 
u01
 #'
 +,	

 
u00	1
 
 
 ^ 65
6      r8   r#  c                   *     e Zd Zdef fdZd Z xZS )JanusVisionAlignerMLPr%   c                 0   t                                                       t          j        j        j                  | _        t          j        fdt          dj	                  D                       | _
        t          j                 | _        d S )Nc                 N    g | ]!}t          j        j        j                  "S r7   r   r   projection_dimr   s     r9   r   z2JanusVisionAlignerMLP.__init__.<locals>.<listcomp>^  s+    eeeRYv,f.CDDeeer8   r   )rY   rZ   r   r   r[   r8  r   r   r   depthhidden_layersr	   r   r   ri   s    `r9   rZ   zJanusVisionAlignerMLP.__init__Y  s    9V/1FGG]eeeeeTUW]WcNdNdeee
 
 $F$56r8   c                     |                      |          }| j        D ]"}|                     |          } ||          }#|S r   r   r:  r   rj   rE   layers      r9   r   zJanusVisionAlignerMLP.forwardb  O    //' 	1 	1E ..}==M!E-00MMr8   )r+   r,   r-   r!   rZ   r   r   r   s   @r9   r4  r4  X  sT        70 7 7 7 7 7 7      r8   r4  c                   b     e Zd ZdZdef fdZdej        fdZdej	        dej
        fdZ xZS )	JanusVQVAEVectorQuantizera  
    A module for vector quantization using learned embedding vectors.

    This module implements the quantization process similar to te one described in
    the VQ-VAE (Vector Quantized Variational AutoEncoder) paper. It quantizes continuous
    input vectors into discrete codebook vectors, which are learned during training.
    Current implementation improves over previous ones by avoiding costly matrix multiplications
    and allowing for post-hoc remapping of indices.
    r%   c                    t                                                       |j        | _        |j        | _        t          |dd          | _        t          j        | j        | j                  | _	        |j
        gdz  | _        d S )Nbetag      ?r   )rY   rZ   num_embeddingsr\   embedding_dimgetattrrC  r   rd   	embeddingrb   quant_state_dimsri   s     r9   rZ   z"JanusVQVAEVectorQuantizer.__init__u  st    $3#-FFD11	d&94;MNN!'!3 4q 8r8   hidden_statec           
      R   |                     dddd                                          }|                    d| j                  }t	          j        |dz  dd          t	          j        | j        j        dz  d          z   dt	          j        d	|| j        j        	                    dd                    z  z
  }t	          j
        |d          }|                     |                              |j                  }t	          j        |                                |z
  dz            | j        t	          j        ||                                z
  dz            z  z   }|||z
                                  z   }|                     dddd                                          }|||fS )
Nr   r   r   r   rW   T)r   keepdimr   z	bd,dn->bn)r{   r   r~   rE  r@   sumrG  rv   einsumr   argminru   meandetachrC  )rj   rI  hidden_state_flattened	distancesmin_encoding_indiceshidden_state_quantrK   s          r9   r   z!JanusVQVAEVectorQuantizer.forward~  s   #++Aq!Q77BBDD!-!2!22t7I!J!J I,a/QEEEi-q0a8889%,{,BDNDYDcDcdeghDiDijjjk 	  %|I1===!^^,@AAFF|GYZZ z-4466E!KLLty[`[e,"5"5"7"77A=\
 \
 P
 

 *-?,-N,V,V,X,XX 0771aCCNNPP!4)===r8   image_tokensro   c                 :   |j         d         }| j        j        j         d         }|                     |          }t          j        |dd          }|                    |g| j        |R           }|                    dddd                                          }|S )Nr   rW   r   )r   r   r   r   )	ru   rG  rv   F	normalizer~   rH  r{   r   )rj   rV  r   emb_dimrU  s        r9   get_codebook_entryz,JanusVQVAEVectorQuantizer.get_codebook_entry  s    !'*
~,226 "^^L99[);qbIII 044j5b4CX5bZa5b5bcc/771aCCNNPP!!r8   )r+   r,   r-   r?   r"   rZ   r@   r   r   
LongTensorrA   r[  r   r   s   @r9   rA  rA  j  s         9/ 9 9 9 9 9 9>EL > > > >6"u/? "EDU " " " " " " " "r8   rA  c                   *     e Zd Z	 	 d fd	Zd Z xZS )JanusVQVAEResnetBlockNFc                    t                                                       || _        ||n|| _        || _        t
          j                            d|dd          | _        t
          j        	                    ||ddd          | _
        t
          j                            d|dd          | _        t
          j                            |j                  | _        t
          j        	                    ||ddd          | _        | j        | j        k    r]| j        r+t
          j        	                    ||ddd          | _        d S t
          j        	                    ||ddd          | _        d S d S )	N    ư>T
num_groupsr`   r   affiner   r   rS   rT   rU   r   )rY   rZ   rQ   rR   use_conv_shortcutr@   r   	GroupNormnorm1r_   conv1norm2r   r   conv2conv_shortcutnin_shortcut)rj   r%   rQ   rR   rl  rk   s        r9   rZ   zJanusVQVAEResnetBlock.__init__  sR    	&+7+?KK\!.X''2KUYbf'gg
X__[,AVWab_cc
X''2LVZcg'hh
x''77X__\<QWXbc_dd
t000% s%*X__[,\]fgqr_%s%s"""$)HOOK[\efpqO$r$r!!!	 10r8   c                    |}|                      |          }|t          j        |          z  }|                     |          }|                     |          }|t          j        |          z  }|                     |          }|                     |          }| j        | j        k    r2| j	        r| 
                    |          }n|                     |          }||z   S r   )rh  r@   sigmoidri  rj  r   rk  rQ   rR   rf  rl  rm  )rj   rE   r   s      r9   r   zJanusVQVAEResnetBlock.forward  s     

=11}555

=11

=11}555]33

=11t000% 7--h77,,X66-''r8   r0  r+   r,   r-   rZ   r   r   r   s   @r9   r^  r^    sZ        
 s s s s s s.( ( ( ( ( ( (r8   r^  c                   $     e Zd Z fdZd Z xZS )JanusVQVAEAttnBlockc                    t                                                       || _        t          j                            d|dd          | _        t          j                            ||ddd          | _        t          j                            ||ddd          | _	        t          j                            ||ddd          | _
        t          j                            ||ddd          | _        d S )Nr`  ra  Trb  r   r   re  )rY   rZ   rQ   r@   r   rg  normr_   qkvproj_outrj   rQ   rk   s     r9   rZ   zJanusVQVAEAttnBlock.__init__  s    &H&&";TXae&ff	kqQR\]^^kqQR\]^^kqQR\]^^[aXYcdeer8   c                    |}|                      |          }|                     |          }|                     |          }|                     |          }|j        \  }}}}	|                    ||||	z                                ddd          }|                    ||||	z            }t          j        ||          }
|
t          |          dz  z  }
t          j        |
d          }
|                    ||||	z            }|
                    ddd          }
t          j        ||
                              ||||	          }|                     |          }||z   S )Nr   r   r   r   rL  )rt  ru  rv  rw  ru   rz   r{   r@   bmmr   rX  r   rx  )rj   rE   r   r   r   r   r   channelsrm   rn   r   r   s               r9   r   zJanusVQVAEAttnBlock.forward  s[    		-00vvm,,VVM**
vvm,, /;.@+
Hfe#++J&5.QQYYZ[]^`abb''
HfunMM
yz::#s8}}'>?y1555 $++J&5.QQ#++Aq!44il;;CCJPXZ`bghhmmK00+%%r8   rp  r   s   @r9   rr  rr    sL        f f f f f& & & & & & &r8   rr  c                   $     e Zd Z fdZd Z xZS )JanusVQVAEConvDownsamplec                     t                                                       t          j        ||ddd          | _        d S )Nr   r   r   re  )rY   rZ   r   r_   convry  s     r9   rZ   z!JanusVQVAEConvDownsample.__init__  s:    Ik;AaYZ[[[			r8   c                 `    t          j        |ddd          }|                     |          }|S )N)r   r   r   r   constantr   )padrs   r   )rX  r  r  r   s     r9   r   z JanusVQVAEConvDownsample.forward  s2    mJVWXXX		-00r8   rp  r   s   @r9   r~  r~    sL        \ \ \ \ \      r8   r~  c                   $     e Zd Z fdZd Z xZS )JanusVQVAEConvUpsamplec                     t                                                       t          j                            ||ddd          | _        d S )Nr   r   re  )rY   rZ   r@   r   r_   r  ry  s     r9   rZ   zJanusVQVAEConvUpsample.__init__  s>    HOOK!TU_`Oaa			r8   c                 ^    t          j        |dd          }|                     |          }|S )Ng       @nearest)scale_factorrs   )rX  r}   r  r   s     r9   r   zJanusVQVAEConvUpsample.forward	  s/    m#IVVV		-00r8   rp  r   s   @r9   r  r    sL        b b b b b      r8   r  c                   L     e Zd Zdedef fdZdej        dej        fdZ xZ	S )JanusVQVAEMidBlockr%   r|  c                     t                                                       t          |||          | _        t	          |          | _        t          |||          | _        d S )Nr%   rQ   rR   )rY   rZ   r^  block_1rr  attn_1block_2)rj   r%   r|  rk   s      r9   rZ   zJanusVQVAEMidBlock.__init__  sl    , !
 
 

 *(33, !
 
 
r8   rE   ro   c                     |                      |          }|                     |          }|                     |          }|S r   )r  r  r  r   s     r9   r   zJanusVQVAEMidBlock.forward  s;    ]33M22]33r8   )
r+   r,   r-   r"   r   rZ   r@   r   r   r   r   s   @r9   r  r    sr        
/ 
3 
 
 
 
 
 
U\ el        r8   r  c                   4     e Zd Z fdZdej        fdZ xZS )JanusVQVAEEncoderc           	         t                                                       t          |j                  | _        |j        | _        |j        }|j        }|j        }|j	        }|j        }t          j                            ||ddd          | _        dt          |          z   }|| _        t          j                    | _        t%          | j                  D ]
}t          j                    }	t          j                    }
|||         z  }|||         z  }t%          | j                  D ]Y}|	                    t)          |||                     |}|| j        dz
  k    r"|
                    t+          |                     Zt          j                    }|	|_        |
|_        || j        dz
  k    rt3          |          |_        | j                            |           t7          ||          | _        t          j                            d|dd	          | _        t          j                            ||rd
|z  n|ddd          | _        d S )Nr   r   re  )r   r  r`  ra  Trb  r   ) rY   rZ   lenchannel_multipliernum_resolutionsnum_res_blocksbase_channelsrQ   double_latentlatent_channelsr@   r   r_   conv_inrH   in_channel_multiplierr   downr   appendr^  rr  Moduleblockattnr~  
downsampler  midrg  norm_outconv_out)rj   r%   r  rQ   r  r  r  r  i_levelr  r  block_in	block_outi_blockr  rk   s                  r9   rZ   zJanusVQVAEEncoder.__init__&  s@   "6#<==$3,(, 0#6x{MqYZdeff $u-?'@'@ @%:"MOO	T122 	# 	#GMOOE=??D$'<W'EEH%(:7(CCI !455 
? 
?)%$,%.     %d2Q666KK 3H = =>>>9;;DDJDI$.222":8"D"DIT""""%fh77**bxUYbf*gg#0EAo ( 
 
r8   r   c                    |                      |          g}t          | j                  D ]}t          | j                  D ]} | j        |         j        |         |d                   }t          | j        |         j                  dk    r! | j        |         j        |         |          }|                    |           || j        dz
  k    r9|                    | j        |         	                    |d                              |d         }| 
                    |          }|                     |          }|t          j        |          z  }|                     |          }|S )NrW   r   r   )r  r   r  r  r  r  r  r  r  r  r  r  r@   ro  r  )rj   r   rE   r  r  rI  rD   s          r9   r   zJanusVQVAEEncoder.forwardY  so   l334T122 		W 		WG !455 3 3@ty17@!"%    ty).//!33#C49W#5#:7#CL#Q#QL$$\2222$.222$$TYw%7%B%B=QSCT%U%UVVV *"- HH%677 !MM*;<<U]+<=== MM*;<<  r8   )r+   r,   r-   rZ   r@   r\  r   r   r   s   @r9   r  r  %  sW        1
 1
 1
 1
 1
f!E$4 ! ! ! ! ! ! ! !r8   r  c                   B     e Zd Z fdZdej        dej        fdZ xZS )JanusVQVAEDecoderc           	      z   t                                                       t          |j                  | _        |j        | _        |j        }|j        }|j        }||j        | j        dz
           z  }t          j
                            ||ddd          | _        t          ||          | _        t          j                    | _        t#          t%          | j                            D ]}t          j                    }t          j                    }||j        |         z  }	t%          | j        dz             D ]Y}
|                    t)          |||	                     |	}|| j        dz
  k    r"|                    t+          |                     Zt          j                    }||_        ||_        |dk    rt3          |          |_        | j                            |           t          j
                            d|dd	          | _        t          j
                            ||ddd          | _        d S )
Nr   r   re  r  r   r`  ra  Trb  )rY   rZ   r  r  r  r  r  r  rR   r@   r   r_   r  r  r  r   upreversedr   r  r^  rr  r  r  r  r  upsamplerg  r  r  )rj   r%   r  r  rR   r  r  r  r  r  r  r  rk   s               r9   rZ   zJanusVQVAEDecoder.__init__s  s   "6#<==$3, 0* !6#<T=QTU=U#VV xaXYcdee &fh77 -//d&: ; ;<< 	 	GMOOE=??D%(A'(JJI !4q!899 
? 
?)%$,%.     %d2Q666KK 3H = =>>>BBHBG!||4X>>GNN2 **bxUYbf*gg,AVWabccr8   rI  ro   c                 d   |                      |          }|                     |          }t          | j                  D ]}t          | j        dz             D ]g} | j        |         j        |         |          }t          | j        |         j                  dk    r! | j        |         j        |         |          }h|| j        dz
  k    r | j        |         	                    |          }| 
                    |          }|t          j        |          z  }|                     |          }|S )Nr   r   )r  r  r   r  r  r  r  r  r  r  r  r@   ro  r  )rj   rI  r  r  s       r9   r   zJanusVQVAEDecoder.forward  s)   ||L11 xx-- T122 	G 	GG !4q!899 P P>tww/5g>|LLtww',--11#A477#3#8#A,#O#OL$.222#ww/88FF}}\22l333}}\22r8   )r+   r,   r-   rZ   r@   rA   r   r   r   s   @r9   r  r  r  sf        ,d ,d ,d ,d ,d\E$5 %:K        r8   r  aS  
    The VQ-VAE model used in Janus for encoding/decoding images into discrete tokens.
    This model follows the "Make-a-scene: Scene-based text-to-image generation with human priors" paper from
    [ Oran Gafni, Adam Polyak, Oron Ashual, Shelly Sheynin, Devi Parikh, and Yaniv
    Taigman](https://huggingface.co/papers/2203.13131).
    c                        e Zd ZU eed<   g dZdZdef fdZdej	        fdZ
dej	        dej        fdZeedej        deej        ej        f         fd	                        Z xZS )

JanusVQVAEr%   )rr  r^  rA  r   c                    t                                          |           t          |          | _        t	          |          | _        t          j                            |j	        |j
        d          | _        t          j                            |j
        |j	        d          | _        |                                  t          |          | _        d| _        |                                  d S )Nr   F)rY   rZ   r  r%  rA  quantizer@   r   r_   r  r\   
quant_convpost_quant_convevalr  decoderr   r'  ri   s     r9   rZ   zJanusVQVAE.__init__  s       (001&99(//&*@&BRTUVV$xv/?AWYZ[[		(00&+# 	r8   c                     |                      |          }|                     |          }|                     |          \  }}}|||fS r   )r%  r  r  )rj   r   rE   quantemb_lossindicess         r9   encodezJanusVQVAE.encode  sI    \2266#'==#?#? xh''r8   rV  ro   c                 r   |j         d         | j        j        d         | j        j        d         z  k    r>t          d| j        j        d         | j        j        d         z   d|j          d          | j                            |          }|                     |          }|                     |          }|S )aG  
        Decodes quantized token IDs into pixel values.
        Args:
            image_tokens (torch.LongTensor): Batch of token IDs.
        Returns:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
                Pixel values decoded from the token IDs.
        r   r   z4Expected `image_tokens` to have shape `(batch_size, z)`, but got shape `z`.)ru   r  rH  r   r[  r  r  )rj   rV  codebook_entryrE   r   s        r9   decodezJanusVQVAE.decode  s     a DM$B1$EHfghHi$iii9t}GefgGhkokx  lJ  KL  lM  HM 9 9"."49 9 9   99,GG,,^<<||M22r8   c                     |j         d         }|                     |          \  }}}|                     |                    |d                    }t	          ||          S )Nr   rW   )ru   r  r  r~   r<   )rj   r   r   r  r>   r  r=   s          r9   r   zJanusVQVAE.forward  sZ     "'*
)-\)B)B&~w#{{7<<
B+G+GHH 4nEEEr8   )r+   r,   r-   r"   r.   r1   r1  rZ   r@   r\  r  rA   r  r   r   rH   r   r   r   s   @r9   r  r    s           
 %O/      (5#3 ( ( ( (5#3 8I    & F'F 
u %"33	4F F F ^ F F F F Fr8   r  c                   *     e Zd Zdef fdZd Z xZS )JanusVQVAEAlignerMLPr%   c                 0   t                                                       t          j        j        j                  | _        t          j        fdt          dj	                  D                       | _
        t          j                 | _        d S )Nc                 N    g | ]!}t          j        j        j                  "S r7   r7  r   s     r9   r   z1JanusVQVAEAlignerMLP.__init__.<locals>.<listcomp>  s+    qqqRYv,f.CDDqqqr8   r   )rY   rZ   r   r   r\   r8  r   r   r   r   r:  r	   r   r   ri   s    `r9   rZ   zJanusVQVAEAlignerMLP.__init__  s    9V-v/DEE]qqqqeTUW]WoNpNpqqq
 
 $F$56r8   c                     |                      |          }| j        D ]"}|                     |          } ||          }#|S r   r<  r=  s      r9   r   zJanusVQVAEAlignerMLP.forward  r?  r8   )r+   r,   r-   r"   rZ   r   r   r   s   @r9   r  r    sT        7/ 7 7 7 7 7 7      r8   r  c                   L     e Zd ZdZdef fdZdej        dej        fdZ	 xZ
S )JanusVQVAEHeadzOHead used for sampling tokens in image generation, replacing the usual lm head.r%   c                    t                                                       t          j        |j        |j                  | _        t          |j                 | _	        t          j        |j        |j
                  | _        d S r   )rY   rZ   r   r   image_token_embed_dimr8  rx  r	   r   r   rD  vision_headri   s     r9   rZ   zJanusVQVAEHead.__init__  sb    	&">@UVV#F$569V%:F<QRRr8   rE   ro   c                     |                      |          }|                     |          }|                     |          }|S r   )rx  r   r  r   s     r9   r   zJanusVQVAEHead.forward  s?    m44**=99((77r8   )r+   r,   r-   r?   r"   rZ   r@   r   r  r   r   r   s   @r9   r  r    sx        YYS/ S S S S S SU\ el        r8   r  zl
    The Janus model which consists of a siglip vision backbone, a Llama language model and a VQ model.
    c                   ~    e Zd Zdef fdZd Zd Zd Zdej	        dej
        dej
        fd	Zee	 	 	 	 	 	 	 	 	 ddeej	                 deej
                 deej                 deej	                 dee         deej	                 deej
                 dee         deeej        f         fd                        Z xZS )
JanusModelr%   c                    t                                          |           || _        t                              |j                  | _        t          | j        j                  | _        t                              |j
                  | _        t          j        | j        j        j        | j        j        j                  | _        t#          | j        j                  | _        t'          | j        j                  | _        t+          j        |j                  | _        d| _        |                                  d S )N)r%   F)rY   rZ   r%   r#  _from_configvision_configvision_modelr4  alignerr  	vq_configvqmodelr   rd   rD  r\   generation_embeddingsr  generation_alignerr  generation_headr   from_configtext_configlanguage_modelr   r'  ri   s     r9   rZ   zJanusModel.__init__#  s       ,99&:NOO,T->-EFF!..v/?@@ &(\$,2E2TVZVbViVs%t%t""6t|7J"K"K-dl.ABB'36;MNNN&+#r8   c                 4    | j                                         S r   )r  r/  r.  s    r9   r/  zJanusModel.get_input_embeddings8  s    "77999r8   c                 :    | j                             |           d S r   )r  set_input_embeddingsrj   r   s     r9   r  zJanusModel.set_input_embeddings;  s    0077777r8   c                 d    |                      |          }|                     |j                  }|S r   )r  r  rD   )rj   r   image_embedss      r9   get_image_featureszJanusModel.get_image_features>  s/    ((66||L$BCCr8   	input_idsr  image_featuresc                 \   |e| |                                  t          j        | j        j        t          j        |j                            k    }|                    d          }n|| j        j        k    }|                                }|	                    d          
                    |                              |j                  }||                                         |                                k    r0|j        d         |j        d         z  }t          d| d|           |S )z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        Nr   devicerW   r   r   z6Image features and image tokens do not match: tokens: z, features )r/  r@   r  r%   image_token_idlongr  allrM  ry   	expand_asr   numelru   r   )rj   r  r  r  special_image_maskn_image_tokensn_image_featuress          r9   get_placeholder_maskzJanusModel.get_placeholder_maskC  s/    !.2M$2K2K2M2MT[7uzR_Rfggg3 3 " "4!7!7!;!;!*dk.H!H+//11/99"==GGVVYYZgZnoo+,22448L8L8N8NNN-3A69Ma9PPvvvdtvv   "!r8   Nr   r   r   rV   r)   cache_position	use_cachelogits_to_keepc
                    |d u |d uz  rt          d          | |                                 |          }||                     |          }|                    d|j        d                   }|                    |j        |j                  }|                     |||          }|	                    ||          } | j
        d|||||||	d|
}t          |j        |j        |j        |j        ||nd           S )NzaYou cannot specify both input_ids and inputs_embeds at the same time, and must specify either onerW   )r  r  )r  r   rV   r)   r  r  r  )rD   r)   rE   rF   rG   r7   )r   r/  r  rz   ru   r   r  r   r  masked_scatterr  rC   rD   r)   rE   rF   )rj   r  r   r   rV   r)   r  r  r  r  r   r  r  image_attention_mask	lm_outputs                  r9   r   zJanusModel.forward[  sX    -t";< 	s    7D5577	BBM#22<@@L)11"m6I"6MNNN+..}/C]EXYYN#'#<#<~ $= $ $  *889M~^^M'D' 	
')%+))	
 	
 	
 	
	 ,'9%5#1 +0<0Hd
 
 
 	
r8   )	NNNNNNNNr   )r+   r,   r-   r    rZ   r/  r  r  r@   r\  rA   r  r   r   r   r   r
   r   r   r   r   r   r   s   @r9   r  r    s       {      *: : :8 8 8  
")":?:K"]b]n" " " "0  15481537+/5959$(34.
 .
E,-.
 u01.
 !.	.

 u/0.
 "%.
 !!12.
   12.
 D>.
 c5</0.
 .
 .
 ^ .
 .
 .
 .
 .
r8   r  c                   B    e Zd ZddgZdZdef fdZd Zd Zde	j
        d	e	j
        fd
Zee	 	 	 	 	 	 	 	 	 	 ddee	j                 dee	j                 dee	j
                 dee	j                 dee         dee	j                 dee	j                 dee	j                 dee         deee	j
        f         dee         fd                        Z	 	 	 	 	 	 d fd	Zde	j
        fdZe	j        	 	 	 d dee	j
                 dee	j                 dee         f fd            Z xZS )!JanusForConditionalGenerationz(model.language_model.embed_tokens.weightzlm_head.weightTr%   c                    t                                          |           || _        t          |          | _        t          j        |j        j        |j        j	        d          | _
        |                                  d S )NFr   )rY   rZ   r%   r  r&   r   r   r  r[   
vocab_sizelm_headr'  ri   s     r9   rZ   z&JanusForConditionalGeneration.__init__  sn       ''
y!3!?ASA^ejkkk 	r8   c                 >    | j         j                                        S r   )r&   r  r/  r.  s    r9   r/  z2JanusForConditionalGeneration.get_input_embeddings  s    z(==???r8   c                 D    | j         j                            |           d S r   )r&   r  r  r  s     r9   r  z2JanusForConditionalGeneration.set_input_embeddings  s!    
!66u=====r8   inputsro   c                 n    | j                             |          }| j                             |          }|S r   )r&   r  r  )rj   r  rI  s      r9   'prepare_embeddings_for_image_generationzEJanusForConditionalGeneration.prepare_embeddings_for_image_generation  s2    z77??z44\BBr8   Nr   r  r   r   rV   r)   r  r  labelsr  r  r   c                 j    | j         d|||||||	|d|}|j        }t          |
t                    rt	          |
 d          n|
}|                     |dd|ddf                   }d}|  | j        d||| j        j        j	        d|}t          |||j        |j        |j        |j                  S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        )r  r   r   rV   r)   r  r  r  N)rL   r  r  )rK   rL   r)   rE   rF   rG   r7   )r&   rD   
isinstancer   slicer  loss_functionr%   r  r  rJ   r)   rE   rF   rG   )rj   r  r   r   rV   r)   r  r  r  r  r  r   outputsrE   slice_indicesrL   rK   s                    r9   r   z%JanusForConditionalGeneration.forward  s   , $* 

%)%+')

 

 

 

  18B>SV8W8Wk~ot444]kmAAA}aaa,?@AA%4% f9P9[ _e D +#3!/) ' ;
 
 
 	
r8   c           	      j     t                      j        |f|||||d|}	|d         dk    r||	d<   |	S )N)r)   r  r   r  r  r   r   )rY   prepare_inputs_for_generation)rj   r  r   r)   r   r  r  r  r   model_inputsrk   s             r9   r  z;JanusForConditionalGeneration.prepare_inputs_for_generation  se     =uww<
+')))
 
 
 
 !!!+7L(r8   rV  c                 t    | j         j                            |          }|                    dddd          }|S )a,  
        Decodes generated image tokens from language model to continuous pixel values
        with VQGAN module via upsampling.
        Args:
            image_tokens (`torch.LongTensor` of shape `(batch_size, num_of_tokens)`):
                The tensors corresponding to the input images.
        r   r   r   r   )r&   r  r  r{   )rj   rV  decoded_images      r9   decode_image_tokensz1JanusForConditionalGeneration.decode_image_tokens  s;     
*11,??%--aAq99r8   logits_processorc           	      z   |                     d| j                  }t          j        |          }|                     dd          }|dk    r t	                      j        d|||d d|S  |j        di |}|                                t          j	        t          j
        fvrt          d          |                                 |                     |                                           ||nt                      }d|d<   |j        !t                               d           d	|_        |j        |d
<   |                     ||j        |          \  }}	}|j        |j        }}
t-          |j                  dk    rt          d|j         d          |d u}|                     |||j                   |j        r9|j        dk    r.|                    t5          |j                             d |_        |                     ||j        d         |d ||          } | j        d|||j        d|\  }}| j        j        j         j!        }|j        \  }}|"                    dd          }|                     dd           }|"                    dd          }||d<   ||d d d f         |j        k    ||d d d f         |j#        d         k    z  }||d d d f         $                    ||j%                    | &                                |          }| '                    |||          }|(                    dd           <| )                    |j*        pd|dz  tW          |j,        ||z             |          |d<   t[          j.        ||f|
|          }|j/        }|j0        }|j1        }|j2        }|j3        }|r|rdnd }|r|rdnd }|r|rdnd }|r|rdnd }ti          |          D ]x} | j5        d||d|}|d         6                    |j                  |d<   |d         6                    |j                  |d<    | j        j7        di |||d}| 8                    ||          }|j9        d d dd d f         :                                } | j        ;                    |           }! |||!          }"|j<        r@t[          j=        |"d          }#t[          j>        |#d          ?                    d          }$nt[          j@        |"d          }$|$|d d |f<   t[          jA        |$|$g          }$|$B                    d          }$| C                    |$          }z|r:|r||!fz  }|r|| D                                fz  }|r
||jE        z  }|r
||jF        z  }|rt          ||!||||jH                  S |S ) Ngeneration_configgeneration_modetext)r  r   r  guidance_scalezGot incompatible mode for Image Generation, should be one of greedy or sampling. Ensure that beam search is de-activated by setting `num_beams=1`.Tr  zU`guidance_scale` is required for CFG but not provided. Setting to default value of 5.   r  r   z;Expected input ids of shape (batch_size, seq_len), but got z3Passing `inputs embeds` is not supported currently.)r  r   )r  input_ids_seq_lengthencoder_input_idsprefix_allowed_tokens_fnr  r  )r  r   expand_sizer   boi_token_idr)   static)cache_implementationr   max_cache_lenmodel_kwargsr  r7   )r  r  r  )output_attentionsoutput_hidden_statesrW   rL  )num_samples)	sequencesscoresrL   rF   rE   r)   )Ipopr  copydeepcopyrY   generateupdateget_generation_moder   SAMPLEGREEDY_SEARCHr   validate_validate_model_kwargsr   r  loggerwarning_prepare_model_inputsbos_token_idr   r  r  ru   _prepare_special_tokensr  r   _get_logits_processor_expand_inputs_for_generationnum_return_sequencesr&   r  r%   num_image_tokensrepeatgeneration_kwargsmasked_fill_pad_token_idr/  _get_initial_cache_positionget
_get_cacher  max
max_lengthr@   r  r!  r"  output_scoresoutput_logitsreturn_dict_in_generater   r  r   r  #_update_model_kwargs_for_generationrD   cloner  	do_sampler   multinomialsqueezeargmaxr  ry   r  floatrF   rE   r   r)   )&rj   r  r   r  r   r  r  r   r  model_input_namer   r  kwargs_has_attention_maskr8  r   r   input_tokensmaskr  generated_tokensr!  r"  rB  rC  rD  
raw_scores
raw_logitsdecoder_hidden_statesdecoder_attentionsir  r	  rI  r%  next_token_scoresprobs
next_tokenrk   s&                                        r9   r)  z&JanusForConditionalGeneration.generate  sd    #JJ':D<RSS M*;<< !**%6??f$$#577# -"3#	 
    0(/99&99 0022>;PR`Rn:oooT   	""$$$##L$5$5$7$7888 0@/K++QdQfQf %)[!+3NNrsss/0,):)I%& 594N4N%2L5
 5
1	#\ ")9vy1$$Fio F F F   %3$$>!$$%68QZcZj$kkk + 	40A0PST0T0T##$IJ[Jj$k$klll/3,  55/!*!3'%)- 6 
 
 #E$"D #
))>#
 #
 	#
 #
	<  :29J'o
G ''1--%))*:DAA'..q!44)7%& Z[[!!!^,0A0NNaaa(,=,OP^,__
 	Z[[!!!^$11$8I8VWWW31133LAA77VV-t44<.2oo%6%K%Wx%>!"3">@PSZ@Z[[) /> / /L*+ !;
4D'EU[abbb .?0E)7)7"3"K3PPRRD
3PPRRD
'> bCW b^b$;\@Q\RRX\'(( #	U #	UA=4= +| GS L .::J-K-N-N}Oc-d-dL)*-9:J-K-N-N}Oc-d-dL)*/dj/  "3%9   G  CCG\ZZL"4QQQAAAX>DDFFL Z//==F 0 0F C C !* E&7R@@@".u!DDDLLRPP

"\*;DDD
%/QQQT" J
#;<<J#--b11J HHTTMM" 	? (vi'
 6|113355
  9"g&88"# ?%)>>%" 
	$,*!-3 ' 7    $#r8   )
NNNNNNNNNr   )NNNNNN)NNN)r+   r,   r-   _tied_weights_keysr5   r    rZ   r/  r  r@   r   r  r   r   r   r\  rA   r
   r   r   r   r   r   r   r  r  no_gradr   r)  r   r   s   @r9   r  r    su       DFVW!{      @ @ @> > >el u|    
  15481537+/5959-1$(341
 1
E,-1
 u011
 !.	1

 u/01
 "%1
 !!121
   121
 )*1
 D>1
 c5</01
 +,1
 1
 1
 ^ 1
l      <
 
 
 
 
 ] *.59:>	|$ |$&|$ !!12|$ ##67	|$ |$ |$ |$ |$ ]|$ |$ |$ |$ |$r8   r  )r$   r  r  r  r#  )r   )Ur'  dataclassesr   typingr   r   r   r@   torch.nn.functionalr   r|   rX  activationsr	   cache_utilsr
   
generationr   r   r   r   generation.utilsr   modeling_layersr   modeling_outputsr   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   utils.genericr   autor   configuration_janusr    r!   r"   
get_loggerr+   r0  r$   r<   rC   rJ   r  rN   r   r   r   rK  r   r   r   r(   r   r  r  r  r#  r4  rA  r^  rr  r~  r  r  r  r  r  r  r  r  r  __all__r7   r8   r9   <module>rl     s]  ,  ! ! ! ! ! ! , , , , , , , , , ,                 ! ! ! ! ! !             u u u u u u u u u u u u 9 9 9 9 9 9 9 9 9 9 9 9 X X X X X X X X X X F F F F F F F F & & & & & & ] ] ] ] ] ] ] ] ] ] ] ] ] ] / / / / / /       Q Q Q Q Q Q Q Q Q Q 
	H	%	% 
. 
. 
. 
. 
.? 
. 
. 
.   
	7 	7 	7 	7 	7{ 	7 	7  	7   
C C C C C; C C  C6   
C C C C C+ C C  C4H H H H HBI H H HV	UU\ 	U# 	U%, 	U 	U 	U 	U& % %I%<% 
% <	%
 U\*% % % '(% % % %4I$ I$ I$ I$ I$29 I$ I$ I$X    RY   (         8      F@ @ @ @ @ @ @ @DI) I) I) I) I)RY I) I) I)X    ry       2   D 1 1 1 1 1+ 1 1 1h    BI   $<" <" <" <" <"	 <" <" <"~)( )( )( )( )(BI )( )( )(X &  &  &  &  &")  &  &  &F	 	 	 	 	ry 	 	 	    RY          ,J! J! J! J! J!	 J! J! J!ZA A A A A	 A A AH   :F :F :F :F :F% :F :F :Fz    29   $    RY       
i
 i
 i
 i
 i
% i
 i
 
i
Xt$ t$ t$ t$ t$$8/ t$ t$ t$n	 t
s
sr8   