
     `i                     t   d Z ddlZddlmZmZ ddlZddlmZ ddlmZ ddl	m
Z
 ddlmZmZmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZmZ ddlmZ ddlmZmZ ddlmZ ddl m!Z!  ej"        e#          Z$ G d dej%                  Z& G d dej%                  Z' G d dej(                  Z) G d dej(                  Z* G d de          Z+e G d de                      Z, G d de,          Z- ed !           G d" d#e,                      Z. ed$!           G d% d&e,e                      Z/d&dgZ0dS )'z/PyTorch TrOCR decoder model (based on RoBERTa).    N)OptionalUnion)nn)CrossEntropyLoss   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)_prepare_4d_attention_mask!_prepare_4d_causal_attention_mask)GradientCheckpointingLayer))BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentions)PreTrainedModel)auto_docstringlogging)deprecate_kwarg   )TrOCRConfigc                   h     e Zd ZdZdedef fdZ	 ddej        ded	eej                 f fd
Z	 xZ
S )TrOCRLearnedPositionalEmbeddingzN
    This module learns positional embeddings up to a fixed maximum size.
    num_embeddingsembedding_dimc                 j    d| _         t                                          || j         z   |           d S )N   )offsetsuper__init__)selfr   r   	__class__s      |/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/trocr/modeling_trocr.pyr    z(TrOCRLearnedPositionalEmbedding.__init__0   s3     $+5}EEEEE    r   N	input_idspast_key_values_lengthposition_idsc                 0   |V|j         dd         \  }}t          j        |||z   t          j        | j        j                                      |d          }n|                    d          }t                      	                    || j
        z             S )z3`input_ids' shape is expected to be [bsz x seqlen].Nr   )dtypedevicer   )shapetorcharangelongweightr*   expand	unsqueezer   forwardr   )r!   r%   r&   r'   bszseq_lenr"   s         r#   r3   z'TrOCRLearnedPositionalEmbedding.forward6   s    
 $?2A2.LC <&(>(HPUPZcgcncu  fS"oo L (11!44Lww|dk9:::r$   )r   N)__name__
__module____qualname____doc__intr    r-   Tensorr   r3   __classcell__r"   s   @r#   r   r   +   s         Fs F3 F F F F F F pt; ;;?B;V^_d_kVl; ; ; ; ; ; ; ; ; ;r$   r   c            
       \     e Zd ZdZd
dedededee         f fdZdej	        f fd	Z
 xZS )TrOCRScaledWordEmbeddingz\
    This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
          ?r   r   padding_idxembed_scalec                 \    t                                          |||           || _        d S N)r   r    rB   )r!   r   r   rA   rB   r"   s        r#   r    z!TrOCRScaledWordEmbedding.__init__L   s-    DDD&r$   r%   c                 V    t                                          |          | j        z  S rD   )r   r3   rB   )r!   r%   r"   s     r#   r3   z TrOCRScaledWordEmbedding.forwardP   s!    wwy))D,<<<r$   )r@   )r6   r7   r8   r9   r:   r   floatr    r-   r;   r3   r<   r=   s   @r#   r?   r?   G   s         ' 's '3 'S '_ghm_n ' ' ' ' ' '= = = = = = = = = = =r$   r?   c            	            e Zd ZdZddededee         f fdZeddededee         fd            Z e	j
                    dd
e	j        defd            Z	 dd
e	j        dedee         fdZ xZS )"TrOCRSinusoidalPositionalEmbeddingzDThis module produces sinusoidal positional embeddings of any length.Nnum_positionsr   rA   c                     t                                                       d| _        || _        || _        |                     |||          | _        |                     dt          j	        d                     d S )Nr   _float_tensorr   )
r   r    r   r   rA   get_embeddingweightsregister_bufferr-   FloatTensor)r!   rI   r   rA   r"   s       r#   r    z+TrOCRSinusoidalPositionalEmbedding.__init__W   sp    *&))-TT_e.?.B.BCCCCCr$   r   c                    |dz  }t          j        d          |dz
  z  }t          j        t          j        |t          j                                                  | z            }t          j        | t          j                                                                      d          |                    d          z  }t          j        t          j	        |          t          j
        |          gd                              | d          }|dz  dk    r+t          j        |t          j        | d          gd          }|	d||ddf<   |                    t          j                              S )	z
        Build sinusoidal embeddings. This matches the implementation in tensor2tensor, but differs slightly from the
        description in Section 3.5 of "Attention Is All You Need".
        r   i'  r   )r)   r   dimr+   N)mathlogr-   expr.   int64rF   r2   catsincosviewzerostoget_default_dtype)r   r   rA   half_dimembs        r#   rL   z0TrOCRSinusoidalPositionalEmbedding.get_embedding_   s?    !A%huooA.iXU[AAAGGIISDPQQl>===CCEEOOPQRRUXUbUbcdUeUeei338a@@@EEnVXYY1!!)S%+na"@"@AqIIIC""#CQQQvve-//000r$   r   r%   r&   c                 >   |                                 \  }}|                     || j        |                              |j                  }| j        dz   |z   }| j        || j                             d          k    r&|                     || j        | j                  | _        | j                            | j                  | _        | j        	                    d|
                    d                    
                    ||d                                          }|S )Nr   r   r+   )size"create_position_ids_from_input_idsrA   r\   r*   rM   rL   r   rK   index_selectrZ   detach)r!   r%   r&   r4   r5   r'   max_posxs           r#   r3   z*TrOCRSinusoidalPositionalEmbedding.forwardr   s     ~~''W>>y$JZ\rssvv
 

 "Q&0<7T\->->q-A-A#A#A--gt7I4K[\\DL|t'9::L%%a):):2)>)>??DDS'SUVV]]__r$   c                     |                     |                                          }t          j        |d                              |          |z   |z  }|                                |z   S )z
        Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding
        symbols are ignored. This is modified from fairseq's `utils.make_positions`.
        r   rQ   )ner:   r-   cumsumtype_asr/   )r!   r%   rA   r&   maskincremental_indicess         r#   rb   zETrOCRSinusoidalPositionalEmbedding.create_position_ids_from_input_ids   sg     ||K((,,..$|Da888@@FFI__cgg"''))K77r$   rD   )r   )r6   r7   r8   r9   r:   r   r    staticmethodrL   r-   no_gradr;   r3   rb   r<   r=   s   @r#   rH   rH   T   s(       NND Dc D# DHUXM D D D D D D 1 1c 1# 1HUXM 1 1 1 \1$ U]__  s    _& bc
8 
8
847
8QYZ]Q^
8 
8 
8 
8 
8 
8 
8 
8r$   rH   c                       e Zd ZdZ	 	 	 	 	 	 	 ddededee         d	ee         d
ee         dee         dee         dee         dee         f fdZ e	ddd          	 	 	 	 	 	 dde
j        dee
j                 dee         dee
j                 dee
j                 dee         dee
j                 dee
j        ee
j                 eee
j                          f         fd            Z xZS )TrOCRAttentionz>Multi-headed attention from 'Attention Is All You Need' paper.N        FT	embed_dim	num_headskdimvdimdropout
is_decoderbiasis_cross_attention	layer_idxc                 <   t                                                       || _        ||n|| _        ||n|| _        || _        || _        ||z  | _        | j        |z  | j        k    st          d| j         d| d          | j        dz  | _	        || _
        |
| _        t          j        | j        ||          | _        t          j        | j        ||          | _        t          j        |||          | _        t          j        |||          | _        d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      ࿩rx   )r   r    rr   rt   ru   rs   rv   head_dim
ValueErrorscalingrw   rz   r   Lineark_projv_projq_projout_proj)r!   configrr   rs   rt   ru   rv   rw   rx   ry   rz   r"   s              r#   r    zTrOCRAttention.__init__   s-    	" ,DD)	 ,DD)	"!Y.	)T^;;"dn " "" " "   }d*$"i	94@@@i	94@@@i	94@@@	)YTBBBr$   past_key_valuepast_key_values4.58new_nameversionhidden_stateskey_value_statesattention_masklayer_head_maskoutput_attentionscache_positionreturnc                 d
   |du}|                                 \  }	}
}|                     |          | j        z  }d}|Ht          |t                    r1|j                            | j                  }|r|j        }n
|j	        }n|}|r|n|}|r3|1|r/|j
        | j                 j        }|j
        | j                 j        }n|                     |          }|                     |          }|                    |	d| j        | j                                      dd          }|                    |	d| j        | j                                      dd          }|N|s|nd}|                    ||| j        d|i          \  }}|r$t          |t                    rd|j        | j        <   |	| j        z  d| j        f}|                    |	|
| j        | j                                      dd          } |j        | } |j        | } |j        | }|                     d          }t+          j        ||                    dd                    }|                                 |	| j        z  |
|fk    r2t/          d|	| j        z  |
|f d	|                                            ||                                 |	d|
|fk    r+t/          d
|	d|
|f d	|                                            |                    |	| j        |
|          |z   }|                    |	| j        z  |
|          }t0          j                            |d          }||                                 | j        fk    r-t/          d| j        f d	|                                            |                    dddd          |                    |	| j        |
|          z  }|                    |	| j        z  |
|          }|r=|                    |	| j        |
|          }|                    |	| j        z  |
|          }nd}t0          j                            || j        | j                  }t+          j        ||          }|                                 |	| j        z  |
| j        fk    r5t/          d|	| j        |
| j        f d	|                                            |                    |	| j        |
| j                  }|                    dd          }|                    |	|
|          }|                     |          }||fS )z#Input shape: Batch x Time x ChannelNFr+   r   r   r   Tz$Attention weights should be of size z	, but is z!Attention mask should be of size rQ   z/Head mask for a single layer should be of size ptrainingz `attn_output` should be of size )ra   r   r   
isinstancer   
is_updatedgetrz   cross_attention_cacheself_attention_cachelayerskeysvaluesr   r   rZ   rs   r}   	transposeupdatereshaper-   bmmr~   r   
functionalsoftmaxrv   r   r   )r!   r   r   r   r   r   r   r   ry   r4   tgt_lenrr   query_statesr   curr_past_key_valuecurrent_states
key_statesvalue_states
proj_shapesrc_lenattn_weightsattn_weights_reshaped
attn_probsattn_outputs                           r#   r3   zTrOCRAttention.forward   s    .T9"/"4"4"6"6Wi {{=11DL@
&/+>?? 6,7;;DNKK
% O*9*O''*9*N''&5#-?R))] 	F/"=*"=,3DNCHJ.5dnELLL^44J;;~66L#b$.$-PPZZ[\^_``J',,S"dndmTT^^_`bcddL*7I!St+>+E+Edn?OQ_>`, ,(
L & F*_FY*Z*Z FAEO.t~>DN*B>
#((gt~t}UU__`acdee+|+Z8'Z'4
+|+Z8//!$$yz/C/CAq/I/IJJ3#7'"JJJ*dn8LgW^7_ * * %%''* *  
 %""$$a'(BBB ta'8Rtt]k]p]p]r]rtt   (,,S$.'7SSVddL',,S4>-A7GTTL},,\r,BB&##%%$.)::: 1t~FW 1 1',,..1 1   +//2q!<<|?P?PQTVZVdfmov?w?wwL',,S4>-A7GTTL 	)
 %1$5$5c4>7T[$\$\!055cDN6JGU\]]LL$(!]**<4<RVR_*``
i
L99#"6!OOO)CRVR_3` ) )$$&&) )  
 "&&sDNGT]SS!++Aq11!))#w	BBmmK00111r$   )NNrq   FTFN)NNNNFN)r6   r7   r8   r9   r:   r   rF   boolr    r   r-   r;   r	   tupler3   r<   r=   s   @r#   rp   rp      s       HH #"#&%*#-2$(!C !C !C 	!C
 sm!C sm!C %!C TN!C tn!C %TN!C D>!C !C !C !C !C !CF _%0A6RRR 48+/1526,115q2 q2|q2 #5<0q2 "%	q2
 !.q2 "%,/q2 $D>q2 !.q2 
u|Xel3XeEL>Q5RR	Sq2 q2 q2 SRq2 q2 q2 q2 q2r$   rp   c                   D    e Zd Zddef fdZ eddd          	 	 	 	 	 	 	 	 	 dd
ej        deej                 deej                 deej                 deej                 deej                 dee	         dee
         dee
         deej                 fd            Z xZS )TrOCRDecoderLayerNr   c                    t                                                       |j        | _        t	          || j        |j        |j        d|          | _        |j        | _        t          |j
                 | _        |j        | _        t          j        | j                  | _        |j        rTt	          || j        |j        |j        |j        |j        dd|	  	        | _        t          j        | j                  | _        t          j        | j        |j                  | _        t          j        |j        | j                  | _        t          j        | j                  | _        d S )NT)rr   rs   rv   rw   rz   )rr   rs   rt   ru   rv   rw   ry   rz   )r   r    hidden_sizerr   rp   decoder_attention_headsattention_dropout	self_attnrv   r   activation_functionactivation_fnactivation_dropoutr   	LayerNormself_attn_layer_normrw   cross_attention_hidden_sizeencoder_attnencoder_attn_layer_normr   decoder_ffn_dimfc1fc2final_layer_norm)r!   r   rz   r"   s      r#   r    zTrOCRDecoderLayer.__init__.  s9   +'n4,
 
 
 ~#F$>?"(";$&L$@$@! 	H .. 8770#'#
! 
! 
!D ,.<+G+GD(9T^V-CDD9V3T^DD "T^ < <r$   r   r   r   r   FTr   r   encoder_hidden_statesencoder_attention_maskr   cross_attn_layer_head_maskr   	use_cacher   c           	      "   |}|                      ||||||
          \  }}t          j                            || j        | j                  }||z   }|                     |          }d}|g|}|                     |||||||
          \  }}t          j                            || j        | j                  }||z   }|                     |          }|}|                     | 	                    |                    }t          j                            || j
        | j                  }|                     |          }t          j                            || j        | j                  }||z   }|                     |          }|f}|r|||fz  }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            encoder_hidden_states (`torch.FloatTensor`):
                cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
            encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
                size *(decoder_attention_heads,)*.
            past_key_values (`Cache`): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )r   r   r   r   r   r   r   N)r   r   r   r   r   r   r   )r   r   r   rv   r   r   r   r   r   r   r   r   r   )r!   r   r   r   r   r   r   r   r   r   r   residualself_attn_weightscross_attn_weightsoutputss                  r#   r3   zTrOCRDecoderLayer.forwardR  s   @ ! ,0>>'+)+/) ,: ,
 ,
(( --mt|VZVc-dd =011-@@ " ,$H040A0A+!65 : /"3- 1B 1 1-M- M11-4<Z^Zg1hhM$}4M 88GGM !**488M+B+BCC--mt?Vaean-oo//--mt|VZVc-dd =0--m<< " 	?)+=>>Gr$   rD   )	NNNNNNFTN)r6   r7   r8   r   r    r   r-   r;   r   r	   r   r3   r<   r=   s   @r#   r   r   -  sN       "= "={ "= "= "= "= "= "=H _%0A6RRR 268<9=26=A+/,1$(15Q Q|Q !.Q  (5	Q
 !) 6Q "%,/Q %-U\$:Q "%Q $D>Q D>Q !.Q Q Q SRQ Q Q Q Qr$   r   c                   .    e Zd ZU eed<   dZdZdgZd ZdS )TrOCRPreTrainedModelr   modelTr   c                    | j         j        }t          |t          j        t          j        f          rJ|j        j                            d|           |j	         |j	        j        
                                 d S d S t          |t          j                  rS|j        j                            d|           |j        -|j        j        |j                 
                                 d S d S d S )Nrq   )meanstd)r   init_stdr   r   r   Conv1dr0   datanormal_rx   zero_	EmbeddingrA   )r!   moduler   s      r#   _init_weightsz"TrOCRPreTrainedModel._init_weights  s    k"fry")455 	?M&&CS&999{& &&((((( '&-- 	?M&&CS&999!-"6#56<<>>>>>	? 	?--r$   N)	r6   r7   r8   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modulesr    r$   r#   r   r     sF         &*#,-	? 	? 	? 	? 	?r$   r   c                   J     e Zd ZdZdef fdZ	 	 	 	 	 	 	 	 	 	 	 	 	 ddZ xZS )TrOCRDecoderz
    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`TrOCRDecoderLayer`]

    Args:
        config: TrOCRConfig
    r   c                    t                                                     j        | _        j        | _        j        | _        j        rt          j	        j
                  nd}t          j        j
        | j        |          | _        j        r t          j        j
                  | _        n0t%          j        | j        z   dz   j
        | j                  | _        j        rt)          j        j
                  | _        nd | _        t)          j        fdt/          j                  D                       | _        d| _        |                                  d S )Nr@   )rB   r   c                 2    g | ]}t          |           S ))rz   )r   ).0ir   s     r#   
<listcomp>z)TrOCRDecoder.__init__.<locals>.<listcomp>  s(    $r$r$rPQ%6v%K%K%K$r$r$rr$   F)r   r    rv   decoder_layerdrop	layerdroppad_token_idrA   scale_embeddingrS   sqrtr   r?   
vocab_sizeembed_tokensuse_learned_position_embeddingsr   max_position_embeddingsembed_positionsrH   layernorm_embeddingr   r   
ModuleListrangedecoder_layersr   gradient_checkpointing	post_init)r!   r   rB   r"   s    ` r#   r    zTrOCRDecoder.__init__  s\      ~1!.7=7MVdi 2333SV4v143CQ\
 
 
 1 	#B6Cacicu#v#vD  #E.1AAAE" $ $D  % 	,')|F4F'G'GD$$'+D$m$r$r$r$rUZ[a[pUqUq$r$r$rss&+#r$   Nc                    |
|
n| j         j        }
||n| j         j        }|	|	n| j         j        }	||n| j         j        }||t          d          |$|}|                    d|j        d                   }n=|,|                                dd         }|dddddf         }nt          d          | j	        r%| j
        r|	rt                              d           d}	|	rO|M|6t          t          | j                   t          | j                             nt          | j                   }|	rCt          |t                     r.t                              d           t          j        |          }||                                nd	}||                     |          }| j         j        r|                     ||
          }n|                     ||
          }||z   }| j        |                     |          }t.          j                            || j        | j
                  }|j        }t5          ||||          }||t7          ||j        |d                   }|rdnd}|
rdnd}|
r|dnd}t;          ||gddg          D ]z\  }}|s|                                d	         t=          | j                  k    rCt          d| dt=          | j                   d|                                d	          d          {tA          | j                  D ]\  }}|r||fz  }| j
        r tC          j"        g           }|| j#        k     r4 ||||||||         nd|||         nd||
|	|
  
        }|d	         }|
r||d         fz  }|||d         fz  }|r||fz  }|st!          d |||||fD                       S tI          |||||          S )a;  
        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
                provide it.

                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
                [`PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
                of the decoder.
            encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
                Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
                selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules in encoder to avoid performing cross-attention
                on hidden heads. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
                It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

                Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
                cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.

                If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
                that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
                all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        NzTYou cannot specify both decoder_input_ids and decoder_inputs_embeds at the same timer+   zEYou have to specify either decoder_input_ids or decoder_inputs_embedsz^`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...F)r   zPassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.r   )r&   r   )r   r   	head_maskcross_attn_head_maskzThe `z` should be specified for z layers, but it is for .)r   r   r   r   r   r   r   r   r   c              3      K   | ]}||V  	d S rD   r   )r   vs     r#   	<genexpr>z'TrOCRDecoder.forward.<locals>.<genexpr>  s0        =  === r$   )last_hidden_stater   r   
attentionscross_attentions)%r   r   output_hidden_statesr   use_return_dictr~   rZ   r,   ra   r   r   loggerwarning_oncer   r
   r   r   from_legacy_cacheget_seq_lengthr   r   r   r   r   r   rv   r   r   r)   ziplenr   	enumerater-   randr   r   )r!   r%   r   r   r   r   r   r   inputs_embedsr   r   r  return_dictr   inputinput_shaper&   	embed_posr   all_hidden_statesall_self_attnsall_cross_attentions	attn_mask	mask_nameidxdecoder_layerdropout_probabilitylayer_outputss                               r#   r3   zTrOCRDecoder.forward  s:   ^ 2C1N--TXT_Tq$8$D  $+Jj 	 "+!6IIDK<Q	%0%<kk$+B]  ]%>sttt"E!r5;r?;;II&',,..ss3K!!!!QQQ(+EEdeee& 	"4= 	" "##t   "	 	0 )4 $L$D$D$DlZ^ZeFfFfFfggg!555 
  	UOU;; 	U\  
 2COTTOETE`!?!?!A!A!Afg  --i88M;6 	g,,UKa,bbII,,YOe,ffI%	1#/ 44]CCM--mt|VZVc-ddk:K8N
 

 !,1G1S%?&(;[QS_& & &"
 #7@BBD0:d&7h<Q<]rrdh %(4H(IKYoKp$q$q 	 	 Iy$>>##A&3t{+;+;<<$3	 3 3SEUEU 3 3%NN,,Q/3 3 3   #,DK"8"8 	@ 	@C# 6!m%55!} &+jnn#&77)M%'=3<3H3dI]Ii,@,E,Eos /"3#-  M *!,M  @=#3"55(4(]1-=,??(   	2-!11 	  ':K^]qr     
 9+++%1
 
 
 	
r$   )NNNNNNNNNNNNN)r6   r7   r8   r9   r   r    r3   r<   r=   s   @r#   r   r     s         {      B "#!!O
 O
 O
 O
 O
 O
 O
 O
r$   r   a  
    The TrOCR Model with a language modeling head. Can be used for summarization.
    This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is
    used in combination with the [`EncoderDecoderModel`] framework.
    )custom_introc                   $     e Zd Z fdZd Z xZS )TrOCRDecoderWrapperc                 r    t                                          |           t          |          | _        d S rD   )r   r    r   decoderr!   r   r"   s     r#   r    zTrOCRDecoderWrapper.__init__  s.       #F++r$   c                      | j         |i |S rD   )r  )r!   argskwargss      r#   r3   zTrOCRDecoderWrapper.forward  s    t|T,V,,,r$   )r6   r7   r8   r    r3   r<   r=   s   @r#   r  r    sG        , , , , ,- - - - - - -r$   r  zy
    The TrOCR Decoder with a language modeling head. Can be used as the decoder part of [`EncoderDecoderModel`] and
    c            "           e Zd ZdgZ fdZd Zd Zd Zd Zd Z	d Z
e	 	 	 	 	 	 	 	 	 	 	 	 	 	 dd
eej                 deej                 deej                 deej                 deej                 deej                 dee         deej                 deej                 dee         dee         dee         dee         deej                 deeef         fd            Z xZS )TrOCRForCausalLMzoutput_projection.weightc                    d|_         d|_        t                                          |           t	          |          | _        t          j        |j        |j	        d          | _
        |                                  d S )NTFr|   )rw   is_encoder_decoderr   r    r  r   r   r   r   r   output_projectionr   r   s     r#   r    zTrOCRForCausalLM.__init__  sr     $)!   (00
!#6+=v?PW\!]!]!] 	r$   c                 $    | j         j        j        S rD   r   r  r   r!   s    r#   get_input_embeddingsz%TrOCRForCausalLM.get_input_embeddings  s    z!..r$   c                 (    || j         j        _        d S rD   r*  )r!   values     r#   set_input_embeddingsz%TrOCRForCausalLM.set_input_embeddings  s    */
'''r$   c                     | j         S rD   r(  r+  s    r#   get_output_embeddingsz&TrOCRForCausalLM.get_output_embeddings  s    %%r$   c                     || _         d S rD   r1  )r!   new_embeddingss     r#   set_output_embeddingsz&TrOCRForCausalLM.set_output_embeddings  s    !/r$   c                     || j         _        d S rD   r   r  )r!   r  s     r#   set_decoderzTrOCRForCausalLM.set_decoder  s    $
r$   c                     | j         j        S rD   r7  r+  s    r#   get_decoderzTrOCRForCausalLM.get_decoder  s    z!!r$   Nr%   r   r   r   r   r   r   r  labelsr   r   r  r  r   r   c                 
   ||n| j         j        }||n| j         j        }||n| j         j        }| j                            |||||||||
||||          }|                     |d                   }d}|	Kt                      } ||                    d| j         j	                  |	                    d                    }|s|f|dd         z   }||f|z   n|S t          |||j        |j        |j        |j                  S )a
  
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import (
        ...     TrOCRConfig,
        ...     TrOCRProcessor,
        ...     TrOCRForCausalLM,
        ...     ViTConfig,
        ...     ViTModel,
        ...     VisionEncoderDecoderModel,
        ... )
        >>> import requests
        >>> from PIL import Image

        >>> # TrOCR is a decoder model and should be used within a VisionEncoderDecoderModel
        >>> # init vision2text model with random weights
        >>> encoder = ViTModel(ViTConfig())
        >>> decoder = TrOCRForCausalLM(TrOCRConfig())
        >>> model = VisionEncoderDecoderModel(encoder=encoder, decoder=decoder)

        >>> # If you want to start from the pretrained model, load the checkpoint with `VisionEncoderDecoderModel`
        >>> processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
        >>> model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")

        >>> # load image from the IAM dataset
        >>> url = "https://fki.tic.heia-fr.ch/static/img/a01-122-02.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
        >>> pixel_values = processor(image, return_tensors="pt").pixel_values
        >>> text = "industry, ' Mr. Brown commented icily. ' Let us have a"

        >>> # training
        >>> model.config.decoder_start_token_id = processor.tokenizer.eos_token_id
        >>> model.config.pad_token_id = processor.tokenizer.pad_token_id
        >>> model.config.vocab_size = model.config.decoder.vocab_size

        >>> labels = processor.tokenizer(text, return_tensors="pt").input_ids
        >>> outputs = model(pixel_values, labels=labels)
        >>> loss = outputs.loss
        >>> round(loss.item(), 2)
        5.30

        >>> # inference
        >>> generated_ids = model.generate(pixel_values)
        >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        >>> generated_text
        'industry, " Mr. Brown commented icily. " Let us have a'
        ```N)r%   r   r   r   r   r   r   r  r   r   r  r  r   r   r+   r   )losslogitsr   r   r  r  )r   r   r  r  r   r  r(  r   rZ   r   r   r   r   r  r  )r!   r%   r   r   r   r   r   r   r  r;  r   r   r  r  r   r   r>  r=  loss_fctoutputs                       r#   r3   zTrOCRForCausalLM.forward  sW   Z 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B] *$$)"7#9!5+'/!5#) % 
 
  ''
33'))H8FKKDK,BCCV[[QS__UUD 	DY,F'+'7D7V##VC0#3!/)$5
 
 
 	
r$   )NNNNNNNNNNNNNN)r6   r7   r8   _tied_weights_keysr    r,  r/  r2  r5  r8  r:  r   r   r-   
LongTensorr;   rO   r	   r   r   r   r   r3   r<   r=   s   @r#   r%  r%    s        55	 	 	 	 	/ / /0 0 0& & &0 0 0% % %" " "  1515=A=A,07;+/59-1$(,0/3&*15u
 u
E,-u
 !.u
  ((9:	u

 !))9 :u
 EL)u
 'u|4u
 "%u
   12u
 )*u
 D>u
 $D>u
 'tnu
 d^u
 !.u
  
u77	8!u
 u
 u
 ^u
 u
 u
 u
 u
r$   r%  )1r9   rS   typingr   r   r-   r   torch.nnr   activationsr   cache_utilsr	   r
   r   
generationr   modeling_attn_mask_utilsr   r   modeling_layersr   modeling_outputsr   r   modeling_utilsr   utilsr   r   utils.deprecationr   configuration_trocrr   
get_loggerr6   r  r   r   r?   ModulerH   rp   r   r   r   r  r%  __all__r   r$   r#   <module>rR     sg   6 5  " " " " " " " "        % % % % % % ! ! ! ! ! ! C C C C C C C C C C ) ) ) ) ) )        : 9 9 9 9 9 l l l l l l l l - - - - - - , , , , , , , , 0 0 0 0 0 0 , , , , , , 
	H	%	%; ; ; ; ;bl ; ; ;8
= 
= 
= 
= 
=r| 
= 
= 
=;8 ;8 ;8 ;8 ;8 ;8 ;8 ;8|X2 X2 X2 X2 X2RY X2 X2 X2vw w w w w2 w w wt ? ? ? ? ?? ? ? ?$v
 v
 v
 v
 v
' v
 v
 v
r   - - - - -. - - -   
V
 V
 V
 V
 V
+_ V
 V
 
V
r 5
6r$   