
     `i                    R   d Z ddlZddlmZmZ ddlZddlmZ ddlmZ ddl	m
Z
mZmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZmZmZ ddlmZ ddlmZmZmZmZm Z m!Z!m"Z" ddl#m$Z$ ddl%m&Z&m'Z'm(Z(  e            rddl)m*Z* ddl+m,Z,  e"j-        e.          Z/ G d dej0                  Z1	 ddl2m3Z3 e3Z1e/4                    d           n&# e5$ r Y ne6$ r e/7                    d           Y nw xY w G d dej0                  Z8 G d dej0                  Z9 G d dej0                  Z: G d de          Z; G d  d!ej0                  Z<e G d" d#e                      Z=e G d$ d%e=                      Z> G d& d'ej0                  Z? G d( d)ej0                  Z@ G d* d+ej0                  ZA G d, d-ej0                  ZB G d. d/ej0                  ZC G d0 d1e          ZD ed23           G d4 d5e=                      ZE ed63           G d7 d8e=e                      ZFg d9ZGdS ):zPix2Struct modeling file    N)OptionalUnion)nn   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)AttentionMaskConverter)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling!CausalLMOutputWithCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput)PreTrainedModel)DUMMY_INPUTS
DUMMY_MASKauto_docstringis_torch_flex_attn_availableis_torch_fx_proxyis_torchdynamo_compilinglogging)deprecate_kwarg   )Pix2StructConfigPix2StructTextConfigPix2StructVisionConfig)	BlockMask)make_flex_block_causal_maskc                   &     e Zd Zd fd	Zd Z xZS )Pix2StructLayerNormư>c                     t                                                       t          j        t	          j        |                    | _        || _        dS )zc
        Construct a layernorm module in the T5 style. No bias and no subtraction of mean.
        N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizeeps	__class__s      /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/pix2struct/modeling_pix2struct.pyr'   zPix2StructLayerNorm.__init__>   sD     	l5:k#:#:;; #    c                 h   |                     t          j                                      d                              dd          }|t          j        || j        z             z  }| j        j        t          j	        t          j
        fv r|                     | j        j                  }| j        |z  S )N   T)keepdim)tor)   float32powmeanrsqrtr,   r+   dtypefloat16bfloat16)r-   hidden_statesvariances      r1   forwardzPix2StructLayerNorm.forwardF   s     !##EM2266q99>>r4>PP%Ht?T4T(U(UU ; ???),,T[->??M{]**r2   )r$   __name__
__module____qualname__r'   rA   __classcell__r0   s   @r1   r#   r#   =   sL        $ $ $ $ $ $+ + + + + + +r2   r#   )FusedRMSNormzWDiscovered apex.normalization.FusedRMSNorm - will use it instead of Pix2StructLayerNormzJDiscovered apex but it failed to load, falling back to Pix2StructLayerNormc                   P     e Zd ZdZdeddf fdZdej        dej        fdZ xZ	S )Pix2StructVisionEmbeddingsa-  
    Construct the embeddings from patch. In `Pix2Struct` the input is different from classic Vision-transformer models.
    Here the input is a sequence of `seq_len` flattened patches that also combines padding patches (tokens). Each patch
    is represented by a vector of `hidden_size` values.
    configreturnNc                 \   t                                                       t          j        |j        |j                  | _        t          j        |j        |j                  | _	        t          j        |j        |j                  | _
        t          j        |j                  | _        d S N)r&   r'   r   Linearpatch_embed_hidden_sizer.   patch_projection	Embeddingseq_lenrow_embeddercolumn_embedderDropoutdropout_ratedropoutr-   rK   r0   s     r1   r'   z#Pix2StructVisionEmbeddings.__init__k   s}     "	&*H&J\ ] ]L9KLL!|FNF<NOOz&"566r2   flattened_patchesc                 d   |d d d d df                                          }|d d d d df                                          }|d d d d dd f         }|                     |          }|                     |          }|                     |          }||z   |z   }|                     |          }|S )Nr   r   r4   )longrQ   rT   rU   rX   )r-   rZ   row_indicescol_indices
embeddingsrow_embeddingscol_embeddingss          r1   rA   z"Pix2StructVisionEmbeddings.forwardt   s     (111a05577'111a05577-aaaABBh7**+<==
**;77--k::  .0>A
\\*--
r2   )
rC   rD   rE   __doc__r   r'   r)   TensorrA   rF   rG   s   @r1   rJ   rJ   d   s|         7/ 7D 7 7 7 7 7 7 %,        r2   rJ   c                   .     e Zd Z fdZ	 	 	 	 ddZ xZS )Pix2StructVisionAttentionc                    t                                                       |j        | _        |j        | _        |j        | _        |j        | _        | j        | j        z  | _	        t          j        | j        | j	        d          | _        t          j        | j        | j	        d          | _        t          j        | j        | j	        d          | _        t          j        | j	        | j        d          | _        d| _        d S NFbias)r&   r'   r.   d_kvkey_value_proj_dimnum_attention_headsn_headsattention_dropoutrX   	inner_dimr   rO   querykeyvalueoutputgradient_checkpointingrY   s     r1   r'   z"Pix2StructVisionAttention.__init__   s    !-"(+1/(?? Yt/eLLL
9T-t~EJJJYt/eLLL
i0@uMMM&+###r2   NFc                     |j         dd         \  } fd} |                     |                    } |                     |                    }	 |                     |                    }
t	          j        ||	                    dd                    }|t	          j        d j        ||f|j	        |j
                  } j        r j        rd|_        |                                dk    r,||ddddddf                             |j	                  z   }nn|||                    |j	                  z   }nNt!                      s@t	          j        |f|j	        |j
                  }||                    |j	                  z   }d|z
  }|                    |dk    t	          j        |j
                  j                  }||z  }t	          j        |t	          j        t	          j        |j
                  j                            }t.          j                            |dt          j        	                              |          }t.          j                            | j         j        
          }|||z  }t	          j        ||
          }|                    dd                                                              d j                  }                      |          }|f|fz   }|r||fz   }|S )z&
        Self-attention block
        Nr4   c                     |                                                      dj        j                                      dd          S )
projectionr5   r   r4   )
contiguousviewrm   rk   	transpose)states
batch_sizer-   s    r1   to_projection_shapez>Pix2StructVisionAttention.forward.<locals>.to_projection_shape   s@    $$&&++JDL$Jabbllmnpqrrrr2   r   r   devicer<   Tr5   )dimr<   ptraining)!shaperp   rq   rr   r)   matmulrz   zerosrm   r   r<   rt   r   requires_gradr   r7   r   r*   masked_fillfinfominmaxtensorr   
functionalsoftmaxr8   type_asrX   rx   ry   ro   rs   )r-   r?   attention_maskposition_biaslayer_head_maskoutput_attentions
seq_lengthr}   query_states
key_statesvalue_statesscoresposition_bias_maskedattn_weightsattn_outputoutputsr|   s   `               @r1   rA   z!Pix2StructVisionAttention.forward   s$    "/!4RaR!8
J	s 	s 	s 	s 	s 	s +*4::m+D+DEE )(-)@)@AA
**4::m+D+DEE lJ,@,@A,F,FGG !KDL*j9&-W]Wc  M * 3t} 3.2+!!##q(( -qqq$aaa?O0P0S0STaTh0i0i i+ -0A0A-BV0W0W W-// X!&,]5IQ^Qd" " " !.0A0A-BV0W0W W-M,88!9KU[Y_YeMfMfMjkk&&65<FL0I0I0M#N#NOO },,V5=,QQYYZ`aa },,\T\TXTa,bb &'/9Ll<>> "++Aq11<<>>CCJPRTXTbcckk+...M#33 	0/Gr2   )NNNFrB   rG   s   @r1   re   re      sb        , , , , ,& M M M M M M M Mr2   re   c                   *     e Zd Zdef fdZd Z xZS )Pix2StructVisionMlprK   c                    t                                                       t          j        |j        |j        d          | _        t          j        |j        |j        d          | _        t          j        |j        |j        d          | _        t          j	        |j
                  | _        t          |j                 | _        d S rg   r&   r'   r   rO   r.   d_ffwi_0wi_1worV   rW   rX   r   dense_act_fnactrY   s     r1   r'   zPix2StructVisionMlp.__init__       If0&+EJJJ	If0&+EJJJ	)FK);%HHHz&"566&-.r2   c                    |                      |                     |                    }|                     |          }||z  }|                     |          }t	          | j        j        t          j                  r]|j	        | j        j        j	        k    rC| j        j        j	        t          j
        k    r$|                    | j        j        j	                  }|                     |          }|S rN   r   r   r   rX   
isinstancer   r+   r)   rc   r<   int8r7   r-   r?   hidden_geluhidden_linears       r1   rA   zPix2StructVisionMlp.forward       hhtyy7788		-00#m3]33 tw~u|44	C#tw~';;;$
22),,TW^-ABBM..r2   )rC   rD   rE   r   r'   rA   rF   rG   s   @r1   r   r      sT        /5 / / / / / /      r2   r   c                        e Zd Zdeddf fdZ	 	 	 ddej        deej                 deej                 d	ede	e
ej        ej        f         e
ej                 f         f
d
Z xZS )Pix2StructVisionLayerrK   rL   Nc                 >   t                                                       |j        | _        d| _        t	          |          | _        t          |          | _        t          |j	        |j
                  | _        t          |j	        |j
                  | _        d S )Nr   r/   )r&   r'   chunk_size_feed_forwardseq_len_dimre   	attentionr   mlpr#   r.   layer_norm_epspre_mlp_layer_normpre_attention_layer_normrY   s     r1   r'   zPix2StructVisionLayer.__init__  s    '-'E$26::&v.."5f6HfNc"d"d"d(;F<NTZTi(j(j(j%%%r2   Fr?   r   	head_maskr   c                     |}|                      |          }|                     ||||          }|d         }|dd          }||z   }|                     |          }	|                     |	          |z   }	|	f|z   }|S )N)r   r   r   r   r   )r   r   r   r   )
r-   r?   r   r   r   residualself_attention_outputsattention_outputr   layer_outputs
             r1   rA   zPix2StructVisionLayer.forward  s     ! 55mDD!%)%/	 "0 "
 "
 2!4(, )83 ..}==xx--=/G+r2   )NNF)rC   rD   rE   r   r'   r)   rc   r   boolr   tuplerA   rF   rG   s   @r1   r   r     s        k/ kD k k k k k k 26,0"' | !. EL)	
   
uU\5</0%2EE	F       r2   r   c                        e Zd Zdeddf fdZ	 	 	 	 	 ddej        deej                 d	eej                 d
ededede	e
ef         fdZ xZS )Pix2StructVisionEncoderrK   rL   Nc                     t                                                       | _        t          j        fdt          j                  D                       | _        d| _        d S )Nc                 .    g | ]}t                    S  )r   ).0_rK   s     r1   
<listcomp>z4Pix2StructVisionEncoder.__init__.<locals>.<listcomp>6  s"    #k#k#ka$9&$A$A#k#k#kr2   F)	r&   r'   rK   r   
ModuleListrangenum_hidden_layerslayerrt   rY   s    `r1   r'   z Pix2StructVisionEncoder.__init__3  sa    ]#k#k#k#k5QWQiKjKj#k#k#kll
&+###r2   FTr?   r   r   r   output_hidden_statesreturn_dictc                 .   |rdnd }|rdnd }t          | j                  D ]=\  }	}
|r||fz   }|||	         nd } |
||||          }|d         }|r||d         fz   }>|r||fz   }|st          d |||fD                       S t          |||          S )Nr   r   r   c              3      K   | ]}||V  	d S rN   r   r   vs     r1   	<genexpr>z2Pix2StructVisionEncoder.forward.<locals>.<genexpr>V  s(      mmq_`_l_l_l_l_lmmr2   last_hidden_stater?   
attentions)	enumerater   r   r   )r-   r?   r   r   r   r   r   all_hidden_statesall_self_attentionsilayer_moduler   layer_outputss                r1   rA   zPix2StructVisionEncoder.forward9  s    #7@BBD$5?bb4(44 	P 	POA|# I$58H$H!.7.CillO(LYjkkM)!,M  P&9]1=M<O&O# 	E 1]4D D 	nmm]4EGZ$[mmmmmm++*
 
 
 	
r2   )NNFFT)rC   rD   rE   r   r'   r)   rc   r   r   r   r   r   rA   rF   rG   s   @r1   r   r   2  s        ,5 ,$ , , , , , , 26,0"'%* "
 "
|"
 !."
 EL)	"

  "
 #"
 "
 
uo%	&"
 "
 "
 "
 "
 "
 "
 "
r2   r   c                   @    e Zd ZU eed<   dZed             Zd Zd Z	dS )Pix2StructPreTrainedModelrK   Fc                 v    t          j        t                    }t          j        t                    }|||d}|S )N)decoder_input_ids	input_idsdecoder_attention_mask)r)   r   r   r   )r-   r   
input_maskdummy_inputss       r1   r   z&Pix2StructPreTrainedModel.dummy_inputsd  s=    L..	\*--
!*"&0
 

 r2   c                    | j         j        }t          |t                    r$|j        j                            |dz             dS t          |t                    rt          | j         t                    r| j         j	        j
        n| j         j
        }t          | j         t                    r| j         j	        j        n| j         j        }|j        j        j                            d||dz  z             t          |j        d          r/|j        j        #|j        j        j                                         |j        j        j                            d||dz  z             t          |j        d          r/|j        j        #|j        j        j                                         |j        j        j                            d||dz  z             t          |j        d          r1|j        j        '|j        j        j                                         dS dS dS t          |t&                    rt          | j         t                    r| j         j	        j
        n| j         j
        }t          | j         t                    r| j         j	        j        n| j         j
        }t          | j         t                    r| j         j	        j        n| j         j        }|j        j        j                            d|||z  dz  z             |j        j        j                            d||dz  z             |j        j        j                            d||dz  z             |j        j        j                            d|||z  dz  z             |j        r.|j        j        j                            d||dz  z             dS dS t          |t8          j                  rt          | j         t                    r| j         j	        j
        n| j         j
        }|j        j                            d||dz  z             |j        +|j        j        |j                                                  dS dS t          |t>                    ret          | j         t                    r| j         j	        j
        n| j         j
        }|j         j        j                            d||dz  z             dS t          |t8          j!        t8          j"        f          rt8          j#        $                    |j        j        %                    tL          j'                  d| j         j(                  %                    |j        j)                  |j        _        |j         |j        j                                         dS dS t          |t                    r*|j        !|j        j                            d           dS dS t          |t8          j                  r]|j        j                            d| j         j(                   |j        -|j        j        |j                                                  dS dS dS )zInitialize the weights      ?        g      )r:   stdri   N)*rK   initializer_factorr   r#   r+   datafill_ Pix2StructTextDenseGatedActDenser   text_configr.   r   r   normal_hasattrri   zero_r   r   Pix2StructTextAttentionrj   	num_headsrp   rq   rr   rs   has_relative_attention_biasrelative_attention_biasr   rR   padding_idxPix2StructTextModellm_headrO   Conv2dinittrunc_normal_r7   r)   r8   initializer_ranger<   )r-   modulefactorr.   r   rk   rm   s          r1   _init_weightsz'Pix2StructPreTrainedModel._init_weightso  s   /f122 J	?M$$Vc\22222 @AA H	? dk+;<<-'33[, 
 4>dkK[3\3\r4;*//bfbmbrDK#++&[UYDY:Z+[[[v{F++ .0@0L %++---K#++&[UYDY:Z+[[[v{F++ .0@0L %++---I!))s4D.8Q)RRRvy&)) ,fin.H	#))+++++, ,.H.H 788 7	?
 dk+;<<-'33[,  1;4;HX0Y0Yv',,_c_j_v 
 dk+;<<+'11[*  L$,,#6kTfFfkoEo;p,qqqJ"**;PTCT9U*VVVL$,,#6[RVEV;W,XXXM %--3FwQcGchlFl<m-nnn1 s.5:BBQW\glp[pQqBrrrrrs s--  	? dk+;<<-'33[,  M&&CVPT?T5U&VVV!-"6#56<<>>>>> .- 344 	? dk+;<<-'33[,  N!&..CVX\G\=].^^^^^BI 677 	? "$!6!6"%%em443DKDa "7 " "b$%% M {& &&((((( '& 344 	?}("((----- )(-- 	?M&&CT[5R&SSS!-"6#56<<>>>>>	? 	?--r2   c                    | j         j        }| j         j        }|t          d          t	          |          rHt          j        |j        d d         dz   |          }t          j        ||dd df         gd          }nD|	                    |j                  }|dd df         
                                |ddd f<   ||d<   |t          d          |                    |d	k    |           |S )
Nzself.model.config.decoder_start_token_id has to be defined. In Pix2Struct it is usually set to the pad_token_id. See Pix2Struct docs for more information.r5   )r   .r   r   ).r   z1self.model.config.pad_token_id has to be defined.)rK   decoder_start_token_idpad_token_id
ValueErrorr   r)   fullr   cat	new_zerosclonemasked_fill_)r-   r   r  r  shifted_input_idss        r1   _shift_rightz&Pix2StructPreTrainedModel._shift_right  s   !%!C{/!)<   Y'' 	? %
9?3B3+?$+FH^ _ _ %	+<iSbS>Q*RXZ [ [ [ ) 3 3IO D D)238)<)B)B)D)Dc122g&(>f%PQQQ&&'8D'@,OOO  r2   N)
rC   rD   rE   r   __annotations___can_compile_fullgraphpropertyr   r   r  r   r2   r1   r   r   ^  sc         "  XM? M? M?`! ! ! ! !r2   r   c                   (    e Zd ZU eed<   dZdZdgZdef fdZd Z	de
eee         f         dd	fd
Ze	 	 	 	 	 	 ddeej                 deej                 deej                 dee         dee         dee         deeef         fd            Z xZS )Pix2StructVisionModelrK   rZ   Tr   c                    t                                          |           || _        t          |          | _        t          |          | _        t          |j        |j	                  | _
        |                                  d S Nr   )r&   r'   rK   rJ   r_   r   encoderr#   r.   r   	layernorm	post_initrY   s     r1   r'   zPix2StructVisionModel.__init__  sr       4V<<.v66,V-?VEZ[[[ 	r2   c                     | j         j        S rN   )r_   rQ   r-   s    r1   get_input_embeddingsz*Pix2StructVisionModel.get_input_embeddings  s    //r2   heads_to_prunerL   Nc                     |                                 D ]/\  }}| j        j        |         j                            |           0dS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr  r   r   prune_heads)r-   r  r   headss       r1   _prune_headsz"Pix2StructVisionModel._prune_heads  sU    
 +0022 	C 	CLE5Lu%/;;EBBBB	C 	Cr2   r   r   r   r   r   c                    ||n| j         j        }||n| j         j        }||n| j         j        }|t	          d          |,|                    d          dk                                    }|                     || j         j                  }| 	                    |          }| 
                    ||||||          }|d         }	|                     |	          }	|s|	f}
|
|dd         z   S t          |	|j        |j                  S )	a  
        flattened_patches (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_channels x patch_height x patch_width)`):
            Flattened and padded pixel values. These values can be obtained using [`AutoImageProcessor`]. See
            [`Pix2StructVisionImageProcessor.__call__`] for details. Check the [original
            paper](https://huggingface.co/papers/2210.03347) (figure 5) for more details.

        Example:

        ```python
        >>> import requests
        >>> from PIL import Image
        >>> from transformers import AutoProcessor, Pix2StructVisionModel

        >>> image_processor = AutoProcessor.from_pretrained("google/pix2struct-textcaps-base")
        >>> model = Pix2StructVisionModel.from_pretrained("google/pix2struct-textcaps-base")

        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> last_hidden_states = outputs.last_hidden_state
        >>> list(last_hidden_states.shape)
        [1, 2048, 768]
        ```
        Nz%You have to specify flattened_patchesr5   r  r   )r   r   r   r   r   r   r   )rK   r   r   use_return_dictr  sumfloatget_head_maskr   r_   r  r  r   r?   r   )r-   rZ   r   r   r   r   r   embedding_outputencoder_outputssequence_outputhead_outputss              r1   rA   zPix2StructVisionModel.forward  sM   L 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]$DEEE!/333;;q@GGIIN &&y$+2OPP	??+<==,,)/!5# ' 
 
 *!,..99 	6+-L/!"""555-)7&1
 
 
 	
r2   )NNNNNN)rC   rD   rE   r   r  main_input_namesupports_gradient_checkpointing_no_split_modulesr'   r  dictintlistr   r   r   r)   rc   r   r   r   r   rA   rF   rG   s   @r1   r  r    sd        """")O&*#01
5 
 
 
 
 
 
0 0 0C4T#Y+? CD C C C C  5915,0,0/3&*N
 N
#EL1N
 !.N
 EL)	N

 $D>N
 'tnN
 d^N
 
u00	1N
 N
 N
 ^N
 N
 N
 N
 N
r2   r  c                   *     e Zd Zdef fdZd Z xZS )r   rK   c                    t                                                       t          j        |j        |j        d          | _        t          j        |j        |j        d          | _        t          j        |j        |j        d          | _        t          j	        |j
                  | _        t          |j                 | _        d S rg   r   rY   s     r1   r'   z)Pix2StructTextDenseGatedActDense.__init__M  r   r2   c                    |                      |                     |                    }|                     |          }||z  }|                     |          }t	          | j        j        t          j                  r]|j	        | j        j        j	        k    rC| j        j        j	        t          j
        k    r$|                    | j        j        j	                  }|                     |          }|S rN   r   r   s       r1   rA   z(Pix2StructTextDenseGatedActDense.forwardU  r   r2   rC   rD   rE   r   r'   rA   rF   rG   s   @r1   r   r   L  sT        /3 / / / / / /      r2   r   c                   *     e Zd Zdef fdZd Z xZS )Pix2StructTextLayerFFrK   c                     t                                                       t          |          | _        t	          |j        |j                  | _        t          j	        |j
                  | _        d S r  )r&   r'   r   DenseReluDenser#   r.   layer_norm_epsilon
layer_normr   rV   rW   rX   rY   s     r1   r'   zPix2StructTextLayerFF.__init__j  s[    >vFF-f.@fF_```z&"566r2   c                     |                      |          }|                     |          }||                     |          z   }|S rN   )r9  r7  rX   )r-   r?   forwarded_statess      r1   rA   zPix2StructTextLayerFF.forwardr  sF    ??=99../?@@%5E(F(FFr2   r3  rG   s   @r1   r5  r5  i  sT        73 7 7 7 7 7 7      r2   r5  c                        e Zd Z	 ddedee         f fdZedd	            Zdd
Z	 e
ddd          	 	 	 	 	 	 	 	 	 dd            Z xZS )r   FNrK   	layer_idxc                 *   t                                                       || _        |j        | _        |j        | _        |j        | _        |j        | _        |j        | _	        |j
        | _        | j	        | j        z  | _        || _        |(t                              d| j        j         d           t%          j        | j        | j        d          | _        t%          j        | j        | j        d          | _        t%          j        | j        | j        d          | _        t%          j        | j        | j        d          | _        | j        r$t%          j        | j        | j	                  | _        t5                      | _        d| _        d S )NzInstantiating a decoder z without passing `layer_idx` is not recommended and will to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.Frh   )r&   r'   r   relative_attention_num_bucketsrelative_attention_max_distancer.   rj   rk   r   rm   rW   rX   ro   r=  loggerwarning_oncer0   rC   r   rO   rp   rq   rr   rs   rR   r   setpruned_headsrt   r-   rK   r   r=  r0   s       r1   r'   z Pix2StructTextAttention.__init__z  sn    	+F(.4.S+/5/U,!-"(+'*(??",4>+B , , ,   Yt/1ANNN
9T-t/?eLLLYt/1ANNN
i 0$2BOOO+ 	k+-<8[]a]i+j+jD(EE&+###r2   T       c                 P   d}|rC|dz  }|| dk                         t          j                  |z  z  }t          j        |           } n(t          j        | t          j        |                      } |dz  }| |k     }|t          j        |                                 |z            t          j        ||z            z  ||z
  z                       t          j                  z   }t          j        |t          j	        ||dz
                      }|t          j
        || |          z  }|S )a  
        Adapted from Mesh Tensorflow:
        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

        Translate relative position to a bucket number for relative attention. The relative position is defined as
        memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
        position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
        small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
        positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
        This should allow for more graceful generalization to longer sequences than the model has been trained on

        Args:
            relative_position: an int32 Tensor
            bidirectional: a boolean - whether the attention is bidirectional
            num_buckets: an integer
            max_distance: an integer

        Returns:
            a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
        r   r4   r   )r7   r)   r\   absr   
zeros_likelogr$  math	full_likewhere)relative_positionbidirectionalnum_bucketsmax_distancerelative_buckets	max_exactis_smallrelative_position_if_larges           r1   _relative_position_bucketz1Pix2StructTextAttention._relative_position_bucket  s>   .  	cAK!2Q!6 : :5: F F TT %	*; < <!&+<e>NO`>a>a!b!b b  1$	$y0 &/I'--//);<<h|i/001Y&( "UZ..	&"
 &+Y&8RT_bcTc(d(d&
 &
" 	EK2CE_```r2   c                    || j         j        j        }|,t          j        |t          j        |          dddf         }n|dddf                             |          }t          j        |t          j        |          dddf         }||z
  }|                     |d| j        | j	                  }|                      |          }	|	
                    g d                              d          }	|	S )z%Compute binned relative position biasN)r<   r   F)rP  rQ  rR  )r4   r   r   r   )r   r+   r   r)   aranger\   r7   rW  r?  r@  permute	unsqueeze)
r-   query_length
key_lengthr   cache_positioncontext_positionmemory_positionrO  relative_position_bucketvaluess
             r1   compute_biasz$Pix2StructTextAttention.compute_bias  s   >18?F!$|L
SYZZZ[\[\[\^b[bc-aaag699&AA,zFSSSTXZ[Z[Z[T[\+.>>#'#A#A;=	 $B $
 $
  --.FGG			**44Q77r2   past_key_valuepast_key_values4.58new_nameversionc                    |j         dd         \  }}|du}|                     |          }|                    |d| j        | j                                      dd          }|Ft          |t                    r1|j        	                    | j
                  }|r|j        }n
|j        }n|}|r|n|}|r3|r1|r/|j        | j
                 j        }|j        | j
                 j        }n|                     |          }|                     |          }|                    |d| j        | j                                      dd          }|                    |d| j        | j                                      dd          }|9|s|
nd}
|                    ||| j
        d|
i          \  }}|rd|j        | j
        <   t'          j        ||                    dd                    }||j         d         }||n
|
d         dz   }| j        s@t'          j        d| j        ||f|j        |j        	          }| j        r| j        rd|_        n3|                     |||j        |

          }|dddd| dddf         }|$|ddddddd|j         d         f         }||z   }| j        rUt'          j        |j         d                   }d|t?          | j                  <   |dd|                                 f         }n|}||z  }tB          j"        #                    |$                                d          %                    |          }tB          j"        &                    || j&        | j                  }|||z  }t'          j        ||          }|                    dd          '                                }|                    |d| j(                  }| )                    |          }||f}|	r||fz   }|S )z
        Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states).
        Nr4   r5   r   r^  Tr   r~   )r   r^  r   r  r   )*r   rp   ry   rm   rk   rz   r   r
   
is_updatedgetr=  cross_attention_cacheself_attention_cachelayerskeysrb  rq   rr   updater)   r   r   r   r   r<   rt   r   r   rc  rD  r*   r/  r   r   r   r   r$  r   rX   rx   ro   rs   )r-   r?   maskkey_value_statesr   re  r   r\  	use_cacher   r^  r|   r   is_cross_attentionr   rl  curr_past_key_valuecurrent_statesr   r   r   r]  real_seq_lengthcausal_maskr   r   r   r   s                               r1   rA   zPix2StructTextAttention.forward  sf   & "/!4RaR!8
J .T9zz-00#((RtG^__iijkmnoo &:oGZ+[+[&(377GGJ! K&5&K##&5&J##"1-?R))] 	F/ 	Fj 	F,3DNCHJ.5dnELLL.11J::n55L#RtG^__iijkmnooJ',,ZT\4KbccmmnoqrssL*7I!St+>+E+Edn?OQ_>`, ,(
L & FAEO.t~> lJ,@,@A,F,FGG #)"-J.:.FllN[]L^abLbO3 
E %j*=fm[a[g! ! ! . 74= 726M/ $ 1 1#ZVd !2 ! ! !.aaaZKLL!!!.C D"111aaa,Bj.>r.B,B#BC - ; 	1:m1!455D,-Dd'(()#0DIIKK#@  #0 && },,V\\^^,DDLLVTT},,\T\TXTa,bb &'/9Ll<>>!++Aq11<<>>!&&z2t~FFkk+... 	0/Gr2   FN)TrF  rG  )NN)	NNNNNNFFN)rC   rD   rE   r   r   r.  r'   staticmethodrW  rc  r   rA   rF   rG   s   @r1   r   r   y  s        jn, ,*,ZbcfZg, , , , , ,> -  -  -  \- `   * _%0A6RRR l l l SRl l l l lr2   r   c                   p     e Zd Zd
dee         f fdZ eddd          	 	 	 	 	 	 	 dd	            Z xZS ) Pix2StructTextLayerSelfAttentionFNr=  c                     t                                                       t          |||          | _        t	          |j        |j                  | _        t          j	        |j
                  | _        d S )Nr   r=  r   r&   r'   r   r   r#   r.   r8  r9  r   rV   rW   rX   rE  s       r1   r'   z)Pix2StructTextLayerSelfAttention.__init__R  sl    00KW`
 
 
 .f.@fF_```z&"566r2   rd  re  rf  rg  c	           
          |                      |          }	|                     |	|||||||          }
||                     |
d                   z   }|f|
dd          z   }|S )N)rs  r   r   re  ru  r   r^  r   r   r9  r   rX   )r-   r?   r   r   r   re  ru  r   r^  normed_hidden_statesr   r   s               r1   rA   z(Pix2StructTextLayerSelfAttention.forwardZ  s      $}==>> '++/) * 	
 	
 &5Ea5H(I(II "%5abb%99r2   r{  )NNNNFFN	rC   rD   rE   r   r.  r'   r   rA   rF   rG   s   @r1   r~  r~  Q  s        7 7XVY] 7 7 7 7 7 7 _%0A6RRR    SR    r2   r~  c                   r     e Zd Zd
dee         f fdZ eddd          	 	 	 	 	 	 	 	 dd	            Z xZS )!Pix2StructTextLayerCrossAttentionNr=  c                     t                                                       t          |d|          | _        t	          |j        |j                  | _        t          j	        |j
                  | _        d S )NFr  r   r  )r-   rK   r=  r0   s      r1   r'   z*Pix2StructTextLayerCrossAttention.__init__x  sc    0UZfoppp-f.@fF_```z&"566r2   rd  re  rf  rg  Fc                     |                      |          }|                     |||||||||	|

  
        }||                     |d                   z   }|f|dd          z   }|S )N)	rs  rt  r   r   re  ru  r\  r   r^  r   r   r  )r-   r?   rt  r   r   r   re  ru  r\  r   r^  r  r   r   r   s                  r1   rA   z)Pix2StructTextLayerCrossAttention.forward~  s      $}==>> -'++%/) * 
 
 %t||4DQ4G'H'HH/$4QRR$88r2   rN   )NNNNFNFNr  rG   s   @r1   r  r  w  s        7 7(3- 7 7 7 7 7 7 _%0A6RRR
    SR    r2   r  c                   z     e Zd Zddee         f fdZ eddd          	 	 	 	 	 	 	 	 	 	 	 	 dd
            Z xZS )Pix2StructTextBlockFNr=  c                     t                                                       t          |||          | _        t	          ||          | _        t          |          | _        d S )Nr  )r=  )r&   r'   r~  self_attentionr  encoder_decoder_attentionr5  r   rE  s       r1   r'   zPix2StructTextBlock.__init__  sn    >(C
 
 
 *K*
 *
 *
&
 )00r2   rd  re  rf  rg  Tc                    |                      |||||	|
||          }|d         }|dd          }|j        t          j        k    r_t          j        |                                          r9t          j        |j                  j        dz
  }t          j        || |          }|d u}|r| 	                    ||||||	|d         dz   |
|	  	        }|d         }|j        t          j        k    r_t          j        |                                          r9t          j        |j                  j        dz
  }t          j        || |          }||dd          z   }| 
                    |          }|j        t          j        k    r_t          j        |                                          r9t          j        |j                  j        dz
  }t          j        || |          }|f}||z   S )N)r   r   r   re  ru  r   r^  r   r   i  )r   r   r5   )rt  r   r   r   re  r\  ru  r   )r  r<   r)   r=   isinfanyr   r   clampr  r   )r-   r?   r   r   encoder_hidden_statesencoder_attention_maskencoder_decoder_position_biasr   cross_attn_layer_head_maskre  ru  r   r   r^  r   attention_outputsclamp_valuedo_cross_attentioncross_attention_outputsr   s                       r1   rA   zPix2StructTextBlock.forward  s   " "&!4!4)'++/) "5 	"
 	"
 /q121226 %-//EK4N4N4R4R4T4T/+m&9::>EK!KK<[YYYM2$> 	P&*&D&D!65; : /+B/!3#"3 'E 
' 
'# 4A6M "em33M8R8R8V8V8X8X3#k-*=>>BTI %M|Q\ ] ] ] !24KABB4O O // %-//EK4N4N4R4R4T4T/+m&9::>EK!KK<[YYYM "***r2   r{  )NNNNNNNNFFTNr  rG   s   @r1   r  r    s        1 1XVY] 1 1 1 1 1 1  _%0A6RRR "#&*#'C+ C+ C+ SRC+ C+ C+ C+ C+r2   r  z3
    The standalone text decoder of Pix2Struct
    )custom_introc            #       z    e Zd ZU eed<   dgZdgZdZ fdZd Z	e
	 	 	 	 	 	 	 	 	 	 	 	 	 	 d"deej                 d	eej                 d
eej                 deej                 deej                 deej                 deej                 dee         dee         dee         dee         deej                 dee         deej                 deeej        df         ef         fd            Z	 d#d	eej        df         dej        dej        dedef
dZed	ej        dededej        dej        d efd!            Z xZS )$r   rK   r  zlm_head.weightTc                     t                                                     t          j        j        j                  | _        t          j        fdt          j	                  D                       | _
        t          j        j                  | _        t          j        j                  | _        t          j        j        j        d          | _        |                                  d| _        d S )Nc           	      V    g | ]%}t          t          |d k              |          &S )r   r  )r  r   )r   r   rK   s     r1   r   z0Pix2StructTextModel.__init__.<locals>.<listcomp>  sD        $FQRSV`abbb  r2   r   Frh   )r&   r'   r   rR   
vocab_sizer.   embed_tokensr   r   
num_layersr   r#   r8  final_layer_normrV   rW   rX   rO   r   r  rt   rY   s    `r1   r'   zPix2StructTextModel.__init__  s       L):F<NOO]   v011  
 

 !4F4FFLe f f fz&"566y!3V5FUSSS 	&+###r2   c                     || _         d S rN   )r  r-   new_embeddingss     r1   set_input_embeddingsz(Pix2StructTextModel.set_input_embeddings  s    *r2   Nr   r   r  r  inputs_embedsr   cross_attn_head_maskre  ru  r   r   labelsr   r^  rL   .c                 	   |	|	n| j         j        }	|
|
n| j         j        }
||n| j         j        }||n| j         j        }| j        r%| j        r|	rt                              d           d}	||t          d          |1|
                                }|                    d|d                   }n.||
                                dd         }nt          d          |&| j        
J d            |                     |          }|\  }}|	rZ|X| j         j        r7t          t          | j                   t          | j                             }nt          | j                   }d	}|	|d	         }n||                                }|t#          j        |||z   |j        
          }|7||                                |z   n|}t#          j        |||j        
          }| j         j        r6|                     |||t/          |t                    r|j        n||
          }nO|ddddddf         }|                    |j                  }d|z
  t#          j        |j                  j        z  }|O|
                                \  }}}||f}|t#          j        ||j        
          }|                     |          }nd}|                     || j         j                  }|                     || j         j                  }|rdnd}|
rdnd}|
rdnd}d}d}|                      |          } tC          | j"                  D ]n\  }!}"||!         }#||!         }$|r|| fz   } |"| ||||||#|$||	|
|          }%|%d	         } |%d         }||%|
rdnd         }|
r||%d         fz   }|||%d         fz   }o| #                    |           } |                      |           } | $                    |           }&|r|| fz   }d}'||                    |&j                  }tK          j&        dd          }( |(|&'                                                    d|&
                    d                    |'                                                    d                    }'|stQ          d |'|&||||fD                       S tS          |'|&||||          S )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Pix2StructText is a model with relative position
            embeddings so you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [Pix2StructText
            Training](./t5#training).
        cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
            `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        Example:

        ```python
        >>> from transformers import AutoProcessor, Pix2StructTextModel

        >>> processor = AutoProcessor.from_pretrained("google/pix2struct-textcaps-base")
        >>> model = Pix2StructTextModel.from_pretrained("google/pix2struct-textcaps-base")

        >>> inputs = processor(text="Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> loss = outputs.loss
        ```
        NzZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...FzTYou cannot specify both decoder_input_ids and decoder_inputs_embeds at the same timer5   zEYou have to specify either decoder_input_ids or decoder_inputs_embedsz<You have to initialize the model with valid token embeddings)rK   r   r   )r<   r   r   )r   r  re  ru  r   r^  r   r   r4      r  r:   )ignore_index	reductionc              3      K   | ]}||V  	d S rN   r   r   s     r1   r   z.Pix2StructTextModel.forward.<locals>.<genexpr>  s4         =  !=== r2   )losslogitsre  r?   r   cross_attentions)*rK   ru  r   r   r"  rt   r   rA  warningr  sizery   r  is_encoder_decoderr
   r	   get_seq_lengthr)   rY  r   r*   
is_decoder_update_causal_maskr   ro  r7   r<   r   r   invert_attention_maskr%  r  rX   r   r   r  r   r   CrossEntropyLossrx   r   r   ))r-   r   r   r  r  r  r   r  re  ru  r   r   r  r   r^  kwargsinput_shaper|   r   past_key_values_lengthmask_seq_lengthrz  encoder_batch_sizeencoder_sequence_lengthr   encoder_hidden_shapeencoder_extended_attention_maskr   all_attentionsall_cross_attentionsr   r  r?   r   r   r   r  r   r  r  loss_fcts)                                            r1   rA   zPix2StructTextModel.forward  s   f "+!6IIDK<Q	1B1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]& 	4= 	Y 	NNl   I ]%>sttt"#..**K!r;r?;;II&',,..ss3KKdeee $002p000 --i88M!,
J 	C0{- C"5 444l$+6V6V6V# # #/dk"B"B"B!"%%3A%6""(%4%C%C%E%E"!"\&(>(KTaTh  N ! BQA\..00:==bl  #Z
OML`aaaN;! 	U22o/BCC%44$! KK )D$)9:K%..}/B.CCK,M<O0P0P0TTK !,=R=W=W=Y=Y: 7$68O#P %-).4HQ^Qe)f)f)f&.2.H.HI_.`.`++.2+ &&y$+2HII	#112FH^__"6@BBD0:d&7Brrd(,%]33(44 !	V !	VOA|'lO)=a)@&# I$58H$H!(L%/- /+E /#"3-  M *!,M
 *!,M$00=CT>[aaZ[0\-  V!/=3C2E!E(4+?=QRCSBU+U(--m<<]33m,,   	E 1]4D DYYv}--F*OOOH8F--//44RRII6K\K\K^K^KcKcdfKgKghhD 	   #%"(      1++%1
 
 
 	
r2   Fr    input_tensorc           	      $   | j         j        dk    r||dk                                    r|S d S | j         j        dk    r+t          |t          j                  rt          |          }|S ||                                nd}||j        nd}| j         j        dk    r#|s!|st          j
        |||| j                  rd S |j        }|j        d         }	|r|                                }
n/t          |t          j                  r|j        d	         n||	z   dz   }
|                     ||	|
|||j        d         
          }| j         j        dk    r@|>|j        j        dv r0|s.t	          j        |          j        }t          j        ||          }|S )Nflash_attention_2r   flex_attentionr   Fsdpa)r  r  is_trainingr   r5   )sequence_lengthtarget_lengthr<   r^  r|   )cudaxpunpu)rK   _attn_implementationr  r   r)   rc   r!   r  is_compileabler   _ignore_causal_mask_sdpar   r<   r   get_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_positionr   typer   r   _unmask_unattended)r-   r   r  r^  re  r   past_seen_tokensusing_compilable_cacher<   r  r  rz  	min_dtypes                r1   r  z'Pix2StructTextModel._update_causal_mask  s    ;+/BBB)~/D.I.I.K.K)%%4;+/???.%,77 M!<^!L!L!!
 @O?Z?99;;;`aCRC^!?!?di ;+v55>T5]n5%>*'7 M	    t"&,Q/! 	+??AAMM nel;;<$R((%7!;  PP+')#)!, Q 
 
 K,66*%*.DDD% E E**.I0CKQZ[[Kr2   r  r  r<   r|   c                    | |                                  dk    r| }nMt          j        |          j        }t          j        ||f|||j                  }|dk    rt          j        |d          }|t          j        ||j                  |                    dd          k    z  }|ddddddf         	                    |ddd          }| |
                                }| j        d         }	|ddddddd|	f         | ddddddf                             |j                  z   }
|
dk    }
|ddddddd|	f                             |
|          |ddddddd|	f<   |S )	aM  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        Nr  )
fill_valuer<   r   r   )diagonalr  r5   r   )r   r)   r   r   r  r   triurY  reshapeexpandr
  r   r7   r   )r   r  r  r<   r^  r|   r  rz  r  mask_lengthpadding_masks              r1   r  zIPix2StructTextModel._prepare_4d_causal_attention_mask_with_cache_position5  s   > %.*<*<*>*>!*C*C(KKE**.I* -0Ye\j\q  K !###jqAAA5<n>STTTWeWmWmnprsWtWtttK%dD!!!QQQ&67>>z1bRTUUK))//11,226*111aaaL[L+@ANSTSTSTVZ\`bcbcbcScDdDgDg&E E    ,q05@AAAqqq,;,AV5W5c5c )6 6AAAqqq!!!\k\12 r2   )NNNNNNNNNNNNNN)F)rC   rD   rE   r   r  r,  _tied_weights_keysr+  r'   r  r   r   r)   
LongTensorFloatTensorrc   r   r   r   r   r   rA   r  r|  r.  r<   r  rF   rG   s   @r1   r   r     s         !   ./*+&*#, , , , ,&+ + +  156:=A>B48157;+/$(,0/3-1&*59V
 V
E,-V
 !!23V
  ((9:	V

 !)): ;V
   01V
 E-.V
 'u|4V
 "%V
 D>V
 $D>V
 'tnV
 )*V
 d^V
 !!12V
" 
uU&+,.OO	P#V
 V
 V
 ^V
~ #(B BelK78B lB 	B
 B  B B B BH 444 4 {	4
 4 4 4 4 \4 4 4 4 4r2   r   zr
    A conditional generation model with a language modeling head. Can be used for sequence generation tasks.
    c            &       R    e Zd ZU eed<   dZdgZdef fdZd Zd Z	de
j        fdZd	 Zd
 Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej                 deej                 deej                 deeeej                                   dee         deej                 deej                 dee         dee         dee         dee         deej                 deeej                 ef         f"d            Z xZS )"Pix2StructForConditionalGenerationrK   rZ   zdecoder.lm_head.weightc                     t                                          |           t          |j                  | _        t          |j                  | _        |j        | _        | 	                                 d S rN   )
r&   r'   r  vision_configr  r   r   decoderis_vqar  rY   s     r1   r'   z+Pix2StructForConditionalGeneration.__init__x  s`       ,V-ABB*6+=>>m 	r2   c                 4    | j                                         S rN   )r  r  r  s    r1   r  z7Pix2StructForConditionalGeneration.get_input_embeddings  s    |00222r2   c                 :    | j                             |           d S rN   )r  r  r  s     r1   r  z7Pix2StructForConditionalGeneration.set_input_embeddings  s    )).99999r2   rL   c                 4    | j                                         S rN   )r  get_output_embeddingsr  s    r1   r  z8Pix2StructForConditionalGeneration.get_output_embeddings  s    |11333r2   c                 :    | j                             |           d S rN   )r  set_output_embeddingsr  s     r1   r  z8Pix2StructForConditionalGeneration.set_output_embeddings  s    **>:::::r2   c                     | j         S rN   )r  r  s    r1   get_encoderz.Pix2StructForConditionalGeneration.get_encoder  s
    |r2   Nr   r   r   r   decoder_head_maskr  r'  re  r  decoder_inputs_embedsru  r   r   r   r^  c                    ||n| j         j        j        }||n| j         j        }||                     ||||||          }ne|rct          |t                    sNt          |d         t          |          dk    r|d         ndt          |          dk    r|d         nd          }|d         }|
W|U|S|                     |
          }||n0|	                    | j         j
                                                  }d|dddf<   |                     ||||	||||||||
||          }|s||z   S t          |j        |j        |j        |j        |j        |j        |j        |j        |j        	  	        S )	a  
        flattened_patches (`torch.FloatTensor` of shape `(batch_size, seq_length, hidden_size)`):
            Flattened pixel patches. the `hidden_size` is obtained by the following formula: `hidden_size` =
            `num_channels` * `patch_size` * `patch_size`

            The process of flattening the pixel patches is done by `Pix2StructProcessor`.
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            Pix2StructText uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [Pix2StructText
            Training](./t5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
            `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss for the decoder.

        Example:

        Inference:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Pix2StructForConditionalGeneration

        >>> processor = AutoProcessor.from_pretrained("google/pix2struct-textcaps-base")
        >>> model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-textcaps-base")

        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> # autoregressive generation
        >>> generated_ids = model.generate(**inputs, max_new_tokens=50)
        >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        >>> print(generated_text)
        A stop sign is on a street corner.

        >>> # conditional generation
        >>> text = "A picture of"
        >>> inputs = processor(text=text, images=image, return_tensors="pt", add_special_tokens=False)

        >>> generated_ids = model.generate(**inputs, max_new_tokens=50)
        >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        >>> print(generated_text)
        A picture of a stop sign with a red stop sign
        ```

        Training:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Pix2StructForConditionalGeneration

        >>> processor = AutoProcessor.from_pretrained("google/pix2struct-base")
        >>> model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-base")

        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> text = "A stop sign is on the street corner."

        >>> inputs = processor(images=image, return_tensors="pt")
        >>> labels = processor(text=text, return_tensors="pt").input_ids

        >>> # forward pass
        >>> outputs = model(**inputs, labels=labels)
        >>> loss = outputs.loss
        >>> print(f"{loss.item():.5f}")
        5.94282
        ```N)rZ   r   r   r   r   r   r   r   r4   r   )r   r   r  re  r  r  r   r  ru  r   r   r  r   r^  )	r  r  re  decoder_hidden_statesdecoder_attentionsr  encoder_last_hidden_stater  encoder_attentions)rK   r   ru  r"  r  r   r   lenr  ner  r$  r  r   r  r  re  r?   r   r  r   )r-   rZ   r   r   r   r   r  r  r'  re  r  r  ru  r   r   r   r^  r?   decoder_outputss                      r1   rA   z*Pix2StructForConditionalGeneration.forward  s   d "+!6IIDK<S<]	%0%<kk$+B] ""ll"3-#"3%9' +  OO  	O_!M!M 	-"1!"4474H4H14L4Loa00RV14_1E1E1I1I?1--t  O (*"3";@U@] $ 1 1& 9 9 *5 '&&))$+*BCCIIKK # ,-"111a4( ,,'1/+"/#1'!5/!5#) ' 
 
"  	5"_44 %")+;"1"?.9,=&5&G"1"?.9

 

 

 
	
r2   )NNNNNNNNNNNNNNNN)rC   rD   rE   r   r  r*  r  r'   r  r  r   Moduler  r  r  r   r   r)   r  r  
BoolTensorrc   r   r   r   r   r   rA   rF   rG   s   @r1   r  r  n  s\         )O23	/ 	 	 	 	 	 	3 3 3: : :4ry 4 4 4 4; ; ;    :>6:8<=A159=7;EI+/-18<$(,0/3&*59#q
 q
#E$56q
 !!23q
 $E$45	q

 !))9 :q
 E-.q
 $E$56q
 'u|4q
 "%e.?(@"ABq
 "%q
 )*q
  (5q
 D>q
 $D>q
 'tnq
  d^!q
" !!12#q
$ 
uU&');;	<%q
 q
 q
 ^q
 q
 q
 q
 q
r2   r  )r   r  r  r   )Hrb   rL  typingr   r   r)   r   activationsr   cache_utilsr   r	   r
   
generationr   modeling_attn_mask_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_utilsr   utilsr   r   r   r   r   r   r   utils.deprecationr   configuration_pix2structr   r   r   !torch.nn.attention.flex_attentionr    integrations.flex_attentionr!   
get_loggerrC   rA  r  r#   apex.normalizationrH   infoImportError	Exceptionr  rJ   re   r   r   r   r   r  r   r5  r   r~  r  r  r   r  __all__r   r2   r1   <module>r     s      " " " " " " " "        ! ! ! ! ! ! C C C C C C C C C C ) ) ) ) ) ) > > > > > > 9 9 9 9 9 9              . - - - - -                  1 0 0 0 0 0 d d d d d d d d d d  !! K;;;;;;JJJJJJ 
	H	%	%+ + + + +") + + +2	//////&
KKijjjj 	 	 	D 	 	 	
NN_```D	
! ! ! ! ! ! ! !H^ ^ ^ ^ ^	 ^ ^ ^D    ")   :( ( ( ( (6 ( ( (V)
 )
 )
 )
 )
bi )
 )
 )
X y! y! y! y! y! y! y! y!x l
 l
 l
 l
 l
5 l
 l
 l
`    ry   :    BI    T T T T Tbi T T Tp" " " " "ry " " "L$ $ $ $ $	 $ $ $NU+ U+ U+ U+ U+4 U+ U+ U+p   
p p p p p3 p p 
pf   
Q
 Q
 Q
 Q
 Q
)BO Q
 Q
 
Q
h  s   +C	 	C,C,+C,