
     `ix              	       N   d Z ddlZddlZddlmZ ddlmZmZ ddl	Z	ddl	m
Z
 ddlmZ ddlmZ dd	lmZ dd
lmZmZmZ ddlmZmZmZmZ ddlmZ  ej        e          Ze ed           G d de                                  Ze ed           G d de                                  Z e ed           G d de                                  Z!d Z"d Z# G d de
j$                  Z% G d de
j$                  Z& G d de
j$                  Z'dAd"e	j(        d#e)d$e*d%e	j(        fd&Z+ G d' d(e
j$                  Z, G d) d*e
j$                  Z- G d+ d,e
j$                  Z. G d- d.e
j$                  Z/ G d/ d0e
j$                  Z0 G d1 d2e
j$                  Z1 G d3 d4e
j$                  Z2 G d5 d6e          Z3 G d7 d8e
j$                  Z4e G d9 d:e                      Z5e G d; d<e5                      Z6 ed=           G d> d?e5                      Z7g d@Z8dS )BzPyTorch Donut Swin Transformer model.

This implementation is identical to a regular Swin Transformer, without final layer norm on top of the final hidden
states.    N)	dataclass)OptionalUnion)nn   )ACT2FN)GradientCheckpointingLayer)PreTrainedModel) find_pruneable_heads_and_indicesmeshgridprune_linear_layer)ModelOutputauto_docstringlogging	torch_int   )DonutSwinConfigzS
    DonutSwin encoder's outputs, with potential hidden states and attentions.
    )custom_introc                       e Zd ZU dZdZeej                 ed<   dZ	ee
ej        df                  ed<   dZee
ej        df                  ed<   dZee
ej        df                  ed<   dS )DonutSwinEncoderOutputa  
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
    Nlast_hidden_state.hidden_states
attentionsreshaped_hidden_states)__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__r   tupler   r        /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/donut/modeling_donut_swin.pyr   r   '   s           6:x 12999=AM8E%"3S"89:AAA:>Ju0#567>>>FJHU5+<c+A%BCJJJJJr$   r   z[
    DonutSwin model's outputs that also contains a pooling of the last hidden states.
    c                       e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
eeej        df                  ed<   dZeeej        df                  ed<   dZeeej        df                  ed<   dS )	DonutSwinModelOutputa  
    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
        Average pooling of the last layer hidden-state.
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
    Nr   pooler_output.r   r   r   )r   r   r   r   r   r   r   r    r!   r(   r   r"   r   r   r#   r$   r%   r'   r'   >   s         	 	 6:x 1299915M8E-.555=AM8E%"3S"89:AAA:>Ju0#567>>>FJHU5+<c+A%BCJJJJJr$   r'   z5
    DonutSwin outputs for image classification.
    c                       e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
eeej        df                  ed<   dZeeej        df                  ed<   dZeeej        df                  ed<   dS )	DonutSwinImageClassifierOutputa7  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Classification (or regression if config.num_labels==1) loss.
    logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
        Classification (or regression if config.num_labels==1) scores (before SoftMax).
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
    Nlosslogits.r   r   r   )r   r   r   r   r+   r   r   r    r!   r,   r   r"   r   r   r#   r$   r%   r*   r*   X   s           )-D(5$
%,,,*.FHU&'...=AM8E%"3S"89:AAA:>Ju0#567>>>FJHU5+<c+A%BCJJJJJr$   r*   c                     | j         \  }}}}|                     |||z  |||z  ||          } |                     dddddd                                                              d|||          }|S )z2
    Partitions the given input into windows.
    r   r   r            shapeviewpermute
contiguous)input_featurewindow_size
batch_sizeheightwidthnum_channelswindowss          r%   window_partitionr>   u   s     /<.A+J|!&&Fk);8Lk[g M ##Aq!Q155@@BBGGKYdfrssGNr$   c                     | j         d         }|                     d||z  ||z  |||          } |                     dddddd                                                              d|||          } | S )z?
    Merges windows to produce higher resolution features.
    r1   r   r   r   r.   r/   r0   r2   )r=   r8   r:   r;   r<   s        r%   window_reverser@      sx     =$Lll2v4e{6JKYdfrssGooaAq!Q//::<<AA"feUabbGNr$   c            
            e Zd ZdZd fd	Zdej        dededej        fdZ	 	 dd
e	ej
                 de	ej                 dedeej                 fdZ xZS )DonutSwinEmbeddingszW
    Construct the patch and position embeddings. Optionally, also the mask token.
    Fc                 <   t                                                       t          |          | _        | j        j        }| j        j        | _        |r-t          j        t          j
        dd|j                            nd | _        |j        r6t          j        t          j
        d|dz   |j                            | _        nd | _        t          j        |j                  | _        t          j        |j                  | _        |j        | _        || _        d S )Nr   )super__init__DonutSwinPatchEmbeddingspatch_embeddingsnum_patches	grid_size
patch_gridr   	Parameterr   zeros	embed_dim
mask_tokenuse_absolute_embeddingsposition_embeddings	LayerNormnormDropouthidden_dropout_probdropout
patch_sizeconfig)selfrW   use_mask_tokenrH   	__class__s       r%   rE   zDonutSwinEmbeddings.__init__   s     8 @ @+7/9O]g",u{1a9I'J'JKKKcg) 	,')|EK;QR?TZTd4e4e'f'fD$$'+D$L!122	z&"<== +r$   
embeddingsr:   r;   returnc                    |j         d         dz
  }| j        j         d         dz
  }t          j                                        s||k    r||k    r| j        S | j        ddddf         }| j        ddddf         }|j         d         }|| j        z  }	|| j        z  }
t          |dz            }|                    d|||          }|                    dddd          }t          j
                            ||	|
fdd	
          }|                    dddd                              dd|          }t          j        ||fd          S )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   Nr1   g      ?r   r   r.   bicubicF)sizemodealign_cornersdim)r3   rP   r   jit
is_tracingrV   r   reshaper5   r   
functionalinterpolater4   cat)rX   r[   r:   r;   rH   num_positionsclass_pos_embedpatch_pos_embedrc   
new_height	new_widthsqrt_num_positionss               r%   interpolate_pos_encodingz,DonutSwinEmbeddings.interpolate_pos_encoding   sr    !&q)A-06q9A= y##%% 	,+*F*F6UZ??++2111bqb592111abb59r"t.
T_,	&}c'9::)11!5GI[]`aa)11!Q1==-33i(	 4 
 
 *11!Q1==BB1b#NNy/?;CCCCr$   Npixel_valuesbool_masked_posrp   c                    |j         \  }}}}|                     |          \  }}	|                     |          }|                                \  }
}}|R| j                            |
|d          }|                    d                              |          }|d|z
  z  ||z  z   }| j        '|r|| 	                    |||          z   }n
|| j        z   }| 
                    |          }||	fS )Nr1         ?)r3   rG   rR   r_   rN   expand	unsqueezetype_asrP   rp   rU   )rX   rq   rr   rp   _r<   r:   r;   r[   output_dimensionsr9   seq_lenmask_tokensmasks                 r%   forwardzDonutSwinEmbeddings.forward   s	    *6);&<(,(=(=l(K(K%
%YYz**
!+!2!2
GQ&/00WbIIK",,R0088EED#sTz2[45GGJ#/' C'$*G*G
TZ\a*b*bb

'$*BB
\\*--
,,,r$   )F)NF)r   r   r   r   rE   r   Tensorintrp   r   r    
BoolTensorboolr"   r}   __classcell__rZ   s   @r%   rB   rB      s              &&D5< &D &DUX &D]b]i &D &D &D &DV 7;).	- -u01- "%"23- #'	-
 
u|	- - - - - - - -r$   rB   c                   t     e Zd ZdZ fdZd Zdeej                 de	ej
        e	e         f         fdZ xZS )rF   z
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    c                    t                                                       |j        |j        }}|j        |j        }}t          |t          j        j	                  r|n||f}t          |t          j        j	                  r|n||f}|d         |d         z  |d         |d         z  z  }|| _        || _        || _        || _
        |d         |d         z  |d         |d         z  f| _        t          j        ||||          | _        d S )Nr   r   )kernel_sizestride)rD   rE   
image_sizerV   r<   rM   
isinstancecollectionsabcIterablerH   rI   r   Conv2d
projection)rX   rW   r   rV   r<   hidden_sizerH   rZ   s          r%   rE   z!DonutSwinPatchEmbeddings.__init__   s   !'!2F4EJ
$*$79Ik#-j+/:R#S#SqZZZdfpYq
#-j+/:R#S#SqZZZdfpYq
!!}
15*Q-:VW=:XY$$(&$Q-:a=8*Q-:VW=:XY)L+:^hiiir$   c                 Z   || j         d         z  dk    r@d| j         d         || j         d         z  z
  f}t          j                            ||          }|| j         d         z  dk    rBddd| j         d         || j         d         z  z
  f}t          j                            ||          }|S )Nr   r   )rV   r   rg   pad)rX   rq   r:   r;   
pad_valuess        r%   	maybe_padz"DonutSwinPatchEmbeddings.maybe_pad   s    4?1%%**T_Q/%$/!:L2LLMJ=,,\:FFLDOA&&!++Q4?1#5QRAS8S#STJ=,,\:FFLr$   rq   r\   c                     |j         \  }}}}|                     |||          }|                     |          }|j         \  }}}}||f}|                    d                              dd          }||fS )Nr.   r   )r3   r   r   flatten	transpose)rX   rq   rx   r<   r:   r;   r[   ry   s           r%   r}   z DonutSwinPatchEmbeddings.forward	  s    )5);&<~~lFEBB__\22
(.1fe#UO''**44Q::
,,,r$   )r   r   r   r   rE   r   r   r   r    r"   r~   r   r}   r   r   s   @r%   rF   rF      s         j j j j j  	-HU->$? 	-E%,X]^aXbJbDc 	- 	- 	- 	- 	- 	- 	- 	-r$   rF   c            	            e Zd ZdZej        fdee         dedej        ddf fdZ	d Z
d	ej        d
eeef         dej        fdZ xZS )DonutSwinPatchMerginga'  
    Patch Merging Layer.

    Args:
        input_resolution (`tuple[int]`):
            Resolution of input feature.
        dim (`int`):
            Number of input channels.
        norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`):
            Normalization layer class.
    input_resolutionrc   
norm_layerr\   Nc                     t                                                       || _        || _        t	          j        d|z  d|z  d          | _         |d|z            | _        d S )Nr/   r.   Fbias)rD   rE   r   rc   r   Linear	reductionrR   )rX   r   rc   r   rZ   s       r%   rE   zDonutSwinPatchMerging.__init__#  sa     01s7AG%@@@Jq3w''			r$   c                     |dz  dk    p|dz  dk    }|r.ddd|dz  d|dz  f}t           j                            ||          }|S )Nr.   r   r   )r   rg   r   )rX   r7   r:   r;   
should_padr   s         r%   r   zDonutSwinPatchMerging.maybe_pad*  s\    qjAo:519>
 	IQ519a!<JM--mZHHMr$   r7   input_dimensionsc                    |\  }}|j         \  }}}|                    ||||          }|                     |||          }|d d dd ddd dd d f         }|d d dd ddd dd d f         }	|d d dd ddd dd d f         }
|d d dd ddd dd d f         }t          j        ||	|
|gd          }|                    |dd|z            }|                     |          }|                     |          }|S )Nr   r.   r   r1   r/   )r3   r4   r   r   ri   rR   r   )rX   r7   r   r:   r;   r9   rc   r<   input_feature_0input_feature_1input_feature_2input_feature_3s               r%   r}   zDonutSwinPatchMerging.forward2  sD   ((5(;%
C%**:vulSS}feDD'14a4Aqqq(89'14a4Aqqq(89'14a4Aqqq(89'14a4Aqqq(89	?O_Ve"fhjkk%**:r1|;KLL		-00}55r$   )r   r   r   r   r   rQ   r"   r   ModulerE   r   r   r~   r}   r   r   s   @r%   r   r     s        
 
 XZWc ( (s (# (29 (hl ( ( ( ( ( (  U\ U3PS8_ Y^Ye        r$   r           Finput	drop_probtrainingr\   c                     |dk    s|s| S d|z
  }| j         d         fd| j        dz
  z  z   }|t          j        || j        | j                  z   }|                                 |                     |          |z  }|S )aF  
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    r   r   r   )r   dtypedevice)r3   ndimr   randr   r   floor_div)r   r   r   	keep_probr3   random_tensoroutputs          r%   	drop_pathr   M  s     CxII[^
Q 77E
5EL Y Y YYMYYy!!M1FMr$   c                   j     e Zd ZdZd	dee         ddf fdZdej        dej        fdZ	de
fdZ xZS )
DonutSwinDropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr   r\   c                 V    t                                                       || _        d S N)rD   rE   r   )rX   r   rZ   s     r%   rE   zDonutSwinDropPath.__init__e  s$    "r$   r   c                 8    t          || j        | j                  S r   )r   r   r   rX   r   s     r%   r}   zDonutSwinDropPath.forwardi  s    FFFr$   c                     d| j          S )Nzp=)r   rX   s    r%   
extra_reprzDonutSwinDropPath.extra_reprl  s    $DN$$$r$   r   )r   r   r   r   r   floatrE   r   r~   r}   strr   r   r   s   @r%   r   r   b  s        bb# #(5/ #T # # # # # #GU\ Gel G G G G%C % % % % % % % %r$   r   c                        e Zd Z fdZ	 	 	 d
dej        deej                 deej                 dee         de	ej                 f
d	Z
 xZS )DonutSwinSelfAttentionc                    t                                                       ||z  dk    rt          d| d| d          || _        t	          ||z            | _        | j        | j        z  | _        t          |t          j	        j
                  r|n||f| _        t          j        t          j        d| j        d         z  dz
  d| j        d         z  dz
  z  |                    | _        t          j        | j        d                   }t          j        | j        d                   }t          j        t'          ||gd                    }t          j        |d          }|d d d d d f         |d d d d d f         z
  }	|	                    ddd                                          }	|	d d d d dfxx         | j        d         dz
  z  cc<   |	d d d d dfxx         | j        d         dz
  z  cc<   |	d d d d dfxx         d| j        d         z  dz
  z  cc<   |	                    d	          }
|                     d
|
           t          j        | j        | j        |j                  | _        t          j        | j        | j        |j                  | _        t          j        | j        | j        |j                  | _        t          j        |j                  | _         d S )Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ()r.   r   ij)indexingr1   relative_position_indexr   )!rD   rE   
ValueErrornum_attention_headsr   attention_head_sizeall_head_sizer   r   r   r   r8   r   rK   r   rL   relative_position_bias_tablearangestackr   r   r5   r6   sumregister_bufferr   qkv_biasquerykeyvaluerS   attention_probs_dropout_probrU   )rX   rW   rc   	num_headsr8   coords_hcoords_wcoordscoords_flattenrelative_coordsr   rZ   s              r%   rE   zDonutSwinSelfAttention.__init__r  s   ?akCkk_hkkk   $- #&sY#7#7 !58PP%k;?3KLLlKKS^`kRl 	 -/LKT-a0014T=Ma=P9PST9TUW`aa-
 -
)
 < 0 344< 0 344Xx&:TJJJKKvq11(AAAt4~aaaqqqj7QQ)11!Q::EEGG111a   D$4Q$7!$;;   111a   D$4Q$7!$;;   111a   A(8(;$;a$??   "1"5"5b"9"968OPPPYt143EFO\\\
9T/1C&/ZZZYt143EFO\\\
z&"EFFr$   NFr   attention_mask	head_maskoutput_attentionsr\   c                    |j         \  }}}||d| j        f}|                     |                              |                              dd          }	|                     |                              |                              dd          }
|                     |                              |                              dd          }t          j        |	|
                    dd                    }|t          j
        | j                  z  }| j        | j                            d                   }|                    | j        d         | j        d         z  | j        d         | j        d         z  d          }|                    ddd                                          }||                    d          z   }|v|j         d         }|                    ||z  || j        ||          }||                    d                              d          z   }|                    d| j        ||          }t$          j                            |d          }|                     |          }|||z  }t          j        ||          }|                    dddd                                          }|                                d d         | j        fz   }|                    |          }|r||fn|f}|S )Nr1   r   r.   r   rb   r   )r3   r   r   r4   r   r   r   r   matmulmathsqrtr   r   r8   r5   r6   rv   r   r   rg   softmaxrU   r_   r   )rX   r   r   r   r   r9   rc   r<   hidden_shapequery_layer	key_layervalue_layerattention_scoresrelative_position_bias
mask_shapeattention_probscontext_layernew_context_layer_shapeoutputss                      r%   r}   zDonutSwinSelfAttention.forward  s    )6(;%
C"CT-EFjj//44\BBLLQPQRRHH]++00>>HHANN	jj//44\BBLLQPQRR !<Y5H5HR5P5PQQ+di8P.Q.QQ!%!B4C_CdCdegChCh!i!7!<!<Q$"21"55t7G7JTM]^_M`7`bd"
 "
 "8!?!?1a!H!H!S!S!U!U+.D.N.Nq.Q.QQ%'-a0J/44j(*d6NPSUX     0.2J2J12M2M2W2WXY2Z2ZZ/44R9QSVX[\\ -//0@b/II ,,77  -	9O_kBB%--aAq99DDFF"/"4"4"6"6ss";t?Q>S"S%**+BCC6G]=/22mM]r$   NNF)r   r   r   rE   r   r~   r   r    r   r"   r}   r   r   s   @r%   r   r   q  s        #G #G #G #G #GP 7;15,16 6|6 !!236 E-.	6
 $D>6 
u|	6 6 6 6 6 6 6 6r$   r   c                   P     e Zd Z fdZdej        dej        dej        fdZ xZS )DonutSwinSelfOutputc                     t                                                       t          j        ||          | _        t          j        |j                  | _        d S r   )rD   rE   r   r   denserS   r   rU   rX   rW   rc   rZ   s      r%   rE   zDonutSwinSelfOutput.__init__  sD    YsC((
z&"EFFr$   r   input_tensorr\   c                 Z    |                      |          }|                     |          }|S r   r   rU   )rX   r   r   s      r%   r}   zDonutSwinSelfOutput.forward  s*    

=11]33r$   r   r   r   rE   r   r~   r}   r   r   s   @r%   r   r     sn        G G G G G
U\  RWR^        r$   r   c                        e Zd Z fdZd Z	 	 	 ddej        deej                 deej                 dee	         d	e
ej                 f
d
Z xZS )DonutSwinAttentionc                     t                                                       t          ||||          | _        t	          ||          | _        t                      | _        d S r   )rD   rE   r   rX   r   r   setpruned_heads)rX   rW   rc   r   r8   rZ   s        r%   rE   zDonutSwinAttention.__init__  sQ    *63	;OO	)&#66EEr$   c                    t          |          dk    rd S t          || j        j        | j        j        | j                  \  }}t          | j        j        |          | j        _        t          | j        j        |          | j        _        t          | j        j	        |          | j        _	        t          | j
        j        |d          | j
        _        | j        j        t          |          z
  | j        _        | j        j        | j        j        z  | j        _        | j                            |          | _        d S )Nr   r   rb   )lenr   rX   r   r   r  r   r   r   r   r   r   r   union)rX   headsindexs      r%   prune_headszDonutSwinAttention.prune_heads  s    u::??F7490$)2OQUQb
 
u
 -TY_eDD	*49=%@@	,TY_eDD	.t{/@%QOOO )-	(EE

(R	%"&)"?$)B_"_	 -33E::r$   NFr   r   r   r   r\   c                     |                      ||||          }|                     |d         |          }|f|dd          z   }|S )Nr   r   )rX   r   )rX   r   r   r   r   self_outputsattention_outputr   s           r%   r}   zDonutSwinAttention.forward  sO     yy	K\]];;|AFF#%QRR(88r$   r   )r   r   r   rE   r  r   r~   r   r    r   r"   r}   r   r   s   @r%   r   r     s        " " " " "; ; ;* 7;15,1
 
|
 !!23
 E-.	

 $D>
 
u|	
 
 
 
 
 
 
 
r$   r   c                   B     e Zd Z fdZdej        dej        fdZ xZS )DonutSwinIntermediatec                 $   t                                                       t          j        |t	          |j        |z                      | _        t          |j        t                    rt          |j                 | _        d S |j        | _        d S r   )rD   rE   r   r   r   	mlp_ratior   r   
hidden_actr   r   intermediate_act_fnr   s      r%   rE   zDonutSwinIntermediate.__init__  sx    YsC(83(>$?$?@@
f'-- 	9'-f.?'@D$$$'-'8D$$$r$   r   r\   c                 Z    |                      |          }|                     |          }|S r   )r   r  r   s     r%   r}   zDonutSwinIntermediate.forward  s,    

=1100??r$   r   r   s   @r%   r  r    s^        9 9 9 9 9U\ el        r$   r  c                   B     e Zd Z fdZdej        dej        fdZ xZS )DonutSwinOutputc                     t                                                       t          j        t	          |j        |z            |          | _        t          j        |j                  | _	        d S r   )
rD   rE   r   r   r   r  r   rS   rT   rU   r   s      r%   rE   zDonutSwinOutput.__init__  sT    Ys6#3c#9::C@@
z&"<==r$   r   r\   c                 Z    |                      |          }|                     |          }|S r   r   r   s     r%   r}   zDonutSwinOutput.forward  s*    

=11]33r$   r   r   s   @r%   r  r    s^        > > > > >
U\ el        r$   r  c                        e Zd Zd fd	Zd Zd Zd Z	 	 	 dd	ej        d
e	e
e
f         deej                 dee         dee         de	ej        ej        f         fdZ xZS )DonutSwinLayerr   r   c                    t                                                       |j        | _        || _        |j        | _        || _        t          j        ||j                  | _	        t          |||| j                  | _        |dk    rt          |          nt          j                    | _        t          j        ||j                  | _        t!          ||          | _        t%          ||          | _        d S )N)eps)r8   r   )rD   rE   chunk_size_feed_forward
shift_sizer8   r   r   rQ   layer_norm_epslayernorm_beforer   	attentionr   Identityr   layernorm_afterr  intermediater  r   )rX   rW   rc   r   r   drop_path_rater  rZ   s          r%   rE   zDonutSwinLayer.__init__$  s    '-'E$$!- 0 "Sf6K L L L+FCPTP`aaa>Ls>R>R*>:::XZXcXeXe!|CV5JKKK1&#>>%fc22r$   c                    t          |          | j        k    rnt          d          | _        t          j                                        r&t	          j         t	          j        |                    nt          |          | _        d S d S Nr   )minr8   r   r  r   rd   re   tensor)rX   r   s     r%   set_shift_and_window_sizez(DonutSwinLayer.set_shift_and_window_size1  sv      D$444'llDO=BY=Q=Q=S=Sn	%,'788999Y\]mYnYn  54r$   c           	         | j         dk    r]t          j        d||df||          }t          d| j                   t          | j         | j                    t          | j          d           f}t          d| j                   t          | j         | j                    t          | j          d           f}d}|D ]}	|D ]}
||d d |	|
d d f<   |dz  }t          || j                  }|                    d| j        | j        z            }|                    d          |                    d          z
  }|                    |dk    d                              |dk    d          }nd }|S )Nr   r   r   r1   r.   g      Yr   )	r  r   rL   slicer8   r>   r4   rv   masked_fill)rX   r:   r;   r   r   img_maskheight_sliceswidth_slicescountheight_slicewidth_slicemask_windows	attn_masks                r%   get_attn_maskzDonutSwinLayer.get_attn_mask9  s   ?Q{Avua#8fUUUHa$**++t''$/)9::t&--M a$**++t''$/)9::t&--L
 E -  #/  K@EHQQQk111<=QJEE ,Hd6FGGL',,R1ADDT1TUUL$..q11L4J4J14M4MMI!--i1nfEEQQR[_`R`beffIIIr$   c                     | j         || j         z  z
  | j         z  }| j         || j         z  z
  | j         z  }ddd|d|f}t          j                            ||          }||fS r$  )r8   r   rg   r   )rX   r   r:   r;   	pad_right
pad_bottomr   s          r%   r   zDonutSwinLayer.maybe_padU  sp    %0@(@@DDTT	&$2B)BBdFVV
Ay!Z8
))-DDj((r$   NFr   r   r   r   always_partitionr\   c                    |s|                      |           n	 |\  }}|                                \  }}	}
|}|                     |          }|                    ||||
          }|                     |||          \  }}|j        \  }	}}}	| j        dk    r&t          j        || j         | j         fd          }n|}t          || j
                  }|                    d| j
        | j
        z  |
          }|                     |||j        |j                  }|                     ||||          }|d         }|                    d| j
        | j
        |
          }t          || j
        ||          }| j        dk    r$t          j        || j        | j        fd          }n|}|d         dk    p|d         dk    }|r&|d d d |d |d d f                                         }|                    |||z  |
          }||                     |          z   }|                     |          }|                     |          }||                     |          z   }|r
||d	         fn|f}|S )
Nr   )r   r.   )shiftsdimsr1   r   )r   r   r0   r   )r'  r_   r  r4   r   r3   r  r   rollr>   r8   r3  r   r   r  r@   r6   r   r   r!  r   )rX   r   r   r   r   r7  r:   r;   r9   rx   channelsshortcutr   
height_pad	width_padshifted_hidden_stateshidden_states_windowsr2  attention_outputsr
  attention_windowsshifted_windows
was_paddedlayer_outputlayer_outputss                            r%   r}   zDonutSwinLayer.forward\  s      	**+;<<<<("/"4"4"6"6
Ax --m<<%**:vuhOO %)NN=&%$P$P!z&3&9#:y!?Q$)J}tFVY]YhXhEipv$w$w$w!!$1! !11FHX Y Y 5 : :2t?ORVRb?bdl m m&&	)<EZEa ' 
 
	 !NN!9iK\ + 
 
 -Q/,11"d6FHXZbcc():D<LjZcdd ?Q %
?DOUYUdCelr s s s /]Q&;*Q-!*;
 	V 1!!!WfWfufaaa2G H S S U U-22:v~xXX 4>>2C#D#DD++M::((66$t{{<'@'@@@Qf'8';<<XdWfr$   )r   r   NFF)r   r   r   rE   r'  r3  r   r   r~   r"   r   r   r    r   r}   r   r   s   @r%   r  r  #  s        3 3 3 3 3 3    8) ) ) 26,1+0A A|A  S/A E-.	A
 $D>A #4.A 
u|U\)	*A A A A A A A Ar$   r  c                        e Zd Z fdZ	 	 	 ddej        deeef         deej	                 dee
         dee
         d	eej                 fd
Z xZS )DonutSwinStagec                 6   t                                                       | _        | _        t	          j        fdt          |          D                       | _        | |t          j                  | _	        nd | _	        d| _
        d S )Nc                 l    g | ]0}t          |         |d z  dk    rdn	j        d z            1S )r.   r   )rW   rc   r   r   r"  r  )r  r8   ).0irW   rc   r   r   r   s     r%   
<listcomp>z+DonutSwinStage.__init__.<locals>.<listcomp>  sh     
 
 
  !%5'#,Q<%&UaZZqqf6HA6M  
 
 
r$   )rc   r   F)rD   rE   rW   rc   r   
ModuleListrangeblocksrQ   
downsamplepointing)	rX   rW   rc   r   depthr   r   rS  rZ   s	    ``` `` r%   rE   zDonutSwinStage.__init__  s    m
 
 
 
 
 
 
 
 u
 
 

 
 !(j)9sr|\\\DOO"DOr$   NFr   r   r   r   r7  r\   c                 *   |\  }}t          | j                  D ](\  }}	|||         nd }
 |	|||
||          }|d         })|}| j        -|dz   dz  |dz   dz  }}||||f}|                     ||          }n||||f}|||f}|r||dd          z  }|S )Nr   r   r.   )	enumeraterR  rS  )rX   r   r   r   r   r7  r:   r;   rN  layer_modulelayer_head_maskrG  !hidden_states_before_downsamplingheight_downsampledwidth_downsampledry   stage_outputss                    r%   r}   zDonutSwinStage.forward  s     )(55 	- 	-OA|.7.CillO(L/BSUe M *!,MM,9)?&5;aZA4EPQ	VWGW 1!'0BDU V OO,MO_``MM!' >&(IK\] 	/]122..Mr$   rH  )r   r   r   rE   r   r~   r"   r   r   r    r   r}   r   r   s   @r%   rJ  rJ    s            < 26,1+0 |  S/ E-.	
 $D> #4. 
u|	       r$   rJ  c                        e Zd Z fdZ	 	 	 	 	 	 ddej        deeef         deej	                 dee
         d	ee
         d
ee
         dee
         dee
         deeef         fdZ xZS )DonutSwinEncoderc                     t                                                       t          j                   _         _        d t          j        dj        t          j                  d          D             t          j         fdt           j                  D                        _        d _        d S )Nc                 6    g | ]}|                                 S r#   )item)rM  xs     r%   rO  z-DonutSwinEncoder.__init__.<locals>.<listcomp>  s     lllAqvvxxlllr$   r   cpu)r   c                 t   g | ]}t          t          j        d |z  z            d         d |z  z  d         d |z  z  fj        |         j        |         t          j        d|                   t          j        d|dz                               |j        dz
  k     rt          nd          S )r.   r   r   N)rW   rc   r   rU  r   r   rS  )rJ  r   rM   depthsr   r   
num_layersr   )rM  i_layerrW   dprrI   rX   s     r%   rO  z-DonutSwinEncoder.__init__.<locals>.<listcomp>  s         !F,q'z9::&/lq'z&BIaLUVX_U_D`%a -0$.w7!#fmHWH&=">">V]S`U\_`U`S`EaAbAb"bc9@4?UVCV9V9V44]a    r$   F)rD   rE   r  rf  rg  rW   r   linspacer"  r   r   rP  rQ  layersgradient_checkpointing)rX   rW   rI   ri  rZ   s   ```@r%   rE   zDonutSwinEncoder.__init__  s    fm,,ll63H#fmJ\J\ej!k!k!klllm        %T_55  
 
 ',###r$   NFTr   r   r   r   output_hidden_states(output_hidden_states_before_downsamplingr7  return_dictr\   c	                    |rdnd }	|rdnd }
|rdnd }|r?|j         \  }}} |j        |g||R  }|                    dddd          }|	|fz  }	|
|fz  }
t          | j                  D ]\  }}|||         nd } ||||||          }|d         }|d         }|d         }|d         |d         f}|rP|rN|j         \  }}} |j        |g|d         |d         f|R  }|                    dddd          }|	|fz  }	|
|fz  }
nC|rA|s?|j         \  }}} |j        |g||R  }|                    dddd          }|	|fz  }	|
|fz  }
|r||dd          z  }|st          d ||	|fD                       S t          ||	||
	          S )
Nr#   r   r   r   r.   r   r1   c              3      K   | ]}||V  	d S r   r#   )rM  vs     r%   	<genexpr>z+DonutSwinEncoder.forward.<locals>.<genexpr>/  s(      mmq_`_l_l_l_l_lmmr$   )r   r   r   r   )r3   r4   r5   rW  rk  r"   r   )rX   r   r   r   r   rm  rn  r7  ro  all_hidden_statesall_reshaped_hidden_statesall_self_attentionsr9   rx   r   reshaped_hidden_staterN  rX  rY  rG  rZ  ry   s                         r%   r}   zDonutSwinEncoder.forward  s    #7@BBD+?%IRRT"$5?bb4 	C)6)<&J;$6M$6z$bDT$bVa$b$b$b!$9$A$A!Q1$M$M!-!11&+@*BB&(55  	9  	9OA|.7.CillO(L/BSUe M *!,M0=a0@- -a 0 1" 57H7LM# G(P G-N-T*
A{ )O(I(N)"3A"68I!8L!M)OZ) ) )% )>(E(EaAq(Q(Q%!&G%II!*/D.FF**% G.V G-:-@*
A{(:(::(fHX(fZe(f(f(f%(=(E(EaAq(Q(Q%!m%55!*/D.FF*  9#}QRR'88# 	nmm]4EGZ$[mmmmmm%++*#=	
 
 
 	
r$   )NFFFFT)r   r   r   rE   r   r~   r"   r   r   r    r   r   r   r}   r   r   s   @r%   r_  r_    s        , , , , ,4 26,1/4CH+0&*A
 A
|A
  S/A
 E-.	A

 $D>A
 'tnA
 3;4.A
 #4.A
 d^A
 
u,,	-A
 A
 A
 A
 A
 A
 A
 A
r$   r_  c                   2    e Zd ZU eed<   dZdZdZdgZd Z	dS )DonutSwinPreTrainedModelrW   donutrq   TrJ  c                    t          |t          j        t          j        f          rT|j        j                            d| j        j                   |j	         |j	        j        
                                 dS dS t          |t          j                  r?|j	        j        
                                 |j        j                            d           dS t          |t                    rN|j        |j        j        
                                 |j         |j        j        
                                 dS dS t          |t                     r |j        j        
                                 dS dS )zInitialize the weightsr   )meanstdNrt   )r   r   r   r   weightdatanormal_rW   initializer_ranger   zero_rQ   fill_rB   rN   rP   r   r   )rX   modules     r%   _init_weightsz&DonutSwinPreTrainedModel._init_weightsB  sW   fry")455 	= M&&CT[5R&SSS{& &&((((( '&-- 		=K""$$$M$$S))))) 344 	= ,!&,,...)5*/5577777 65 677 	=/4::<<<<<	= 	=r$   N)
r   r   r   r   r!   base_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modulesr  r#   r$   r%   ry  ry  9  sM          $O&*#)*= = = = =r$   ry  c                        e Zd Zd fd	Zd Zd Ze	 	 	 	 	 	 	 ddeej	                 deej
                 d	eej	                 d
ee         dee         dedee         deeef         fd            Z xZS )DonutSwinModelTFc                    t                                          |           || _        t          |j                  | _        t          |j        d| j        dz
  z  z            | _        t          ||          | _
        t          || j
        j                  | _        |rt          j        d          nd| _        |                                  dS )z
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        use_mask_token (`bool`, *optional*, defaults to `False`):
            Whether to use a mask token for masked image modeling.
        r.   r   )rY   N)rD   rE   rW   r  rf  rg  r   rM   num_featuresrB   r[   r_  rJ   encoderr   AdaptiveAvgPool1dpooler	post_init)rX   rW   add_pooling_layerrY   rZ   s       r%   rE   zDonutSwinModel.__init__X  s     	   fm,, 0119L3M MNN-f^TTT'0JKK1BLb*1--- 	r$   c                     | j         j        S r   )r[   rG   r   s    r%   get_input_embeddingsz#DonutSwinModel.get_input_embeddingsl  s    //r$   c                     |                                 D ]/\  }}| j        j        |         j                            |           0dS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr  layerr  r  )rX   heads_to_pruner  r  s       r%   _prune_headszDonutSwinModel._prune_headso  sU    
 +0022 	C 	CLE5Lu%/;;EBBBB	C 	Cr$   Nrq   rr   r   r   rm  rp   ro  r\   c                 T   ||n| j         j        }||n| j         j        }||n| j         j        }|t	          d          |                     |t          | j         j                            }|                     |||          \  }}	| 	                    ||	||||          }
|
d         }d}| j
        >| 
                    |                    dd                    }t          j        |d          }|s||f|
dd         z   }|S t          |||
j        |
j        |
j                  S )	z
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        Nz You have to specify pixel_values)rr   rp   )r   r   rm  ro  r   r   r.   )r   r(   r   r   r   )rW   r   rm  use_return_dictr   get_head_maskr  rf  r[   r  r  r   r   r   r'   r   r   r   )rX   rq   rr   r   r   rm  rp   ro  embedding_outputr   encoder_outputssequence_outputpooled_outputr   s                 r%   r}   zDonutSwinModel.forwardw  su    2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]?@@@ &&y#dk6H2I2IJJ	-1__/Tl .= .
 .
** ,,/!5# ' 
 
 *!,;" KK(A(A!Q(G(GHHM!M-;;M 	%}58KKFM#-')7&1#2#I
 
 
 	
r$   )TFNNNNNFN)r   r   r   rE   r  r  r   r   r   r    r   r   r   r"   r'   r}   r   r   s   @r%   r  r  V  s            (0 0 0C C C  596:15,0/3).&*=
 =
u01=
 "%"23=
 E-.	=

 $D>=
 'tn=
 #'=
 d^=
 
u**	+=
 =
 =
 ^=
 =
 =
 =
 =
r$   r  a  
    DonutSwin Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
    the [CLS] token) e.g. for ImageNet.

    <Tip>

        Note that it's possible to fine-tune DonutSwin on higher resolution images than the ones it has been trained on, by
        setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained
        position embeddings to the higher resolution.

    </Tip>
    c                        e Zd Z fdZe	 	 	 	 	 	 	 ddeej                 deej                 deej                 dee	         dee	         d	e	d
ee	         de
eef         fd            Z xZS )DonutSwinForImageClassificationc                 @   t                                          |           |j        | _        t          |          | _        |j        dk    r$t          j        | j        j        |j                  nt          j                    | _	        | 
                                 d S r$  )rD   rE   
num_labelsr  rz  r   r   r  r  
classifierr  )rX   rW   rZ   s     r%   rE   z(DonutSwinForImageClassification.__init__  s        +#F++
 FLEVYZEZEZBIdj-v/@AAA`b`k`m`m 	
 	r$   NFrq   r   labelsr   rm  rp   ro  r\   c                 L   ||n| j         j        }|                     ||||||          }|d         }	|                     |	          }
d}||                     ||
| j                   }|s|
f|dd         z   }||f|z   n|S t          ||
|j        |j        |j                  S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N)r   r   rm  rp   ro  r   r.   )r+   r,   r   r   r   )	rW   r  rz  r  loss_functionr*   r   r   r   )rX   rq   r   r  r   rm  rp   ro  r   r  r,   r+   r   s                r%   r}   z'DonutSwinForImageClassification.forward  s    " &1%<kk$+B]**/!5%=#  
 
  
//%%ffdkBBD 	FY,F)-)9TGf$$vE-!/)#*#A
 
 
 	
r$   r  )r   r   r   rE   r   r   r   r    
LongTensorr   r   r"   r*   r}   r   r   s   @r%   r  r    s               5915-1,0/3).&*-
 -
u01-
 E-.-
 )*	-

 $D>-
 'tn-
 #'-
 d^-
 
u44	5-
 -
 -
 ^-
 -
 -
 -
 -
r$   r  )r  ry  r  )r   F)9r   collections.abcr   r   dataclassesr   typingr   r   r   r   activationsr   modeling_layersr	   modeling_utilsr
   pytorch_utilsr   r   r   utilsr   r   r   r   configuration_donut_swinr   
get_loggerr   loggerr   r'   r*   r>   r@   r   rB   rF   r   r~   r   r   r   r   r   r   r   r  r  r  rJ  r_  ry  r  r  __all__r#   r$   r%   <module>r     s   
      ! ! ! ! ! ! " " " " " " " "        ! ! ! ! ! ! 9 9 9 9 9 9 - - - - - - [ [ [ [ [ [ [ [ [ [ D D D D D D D D D D D D 5 5 5 5 5 5 
	H	%	%   K K K K K[ K K  K    K K K K K; K K  K&   K K K K K[ K K  K,	 	 	  Y- Y- Y- Y- Y-") Y- Y- Y-z(- (- (- (- (-ry (- (- (-X3 3 3 3 3BI 3 3 3n U\ e T V[Vb    *% % % % %	 % % %\ \ \ \ \RY \ \ \@
 
 
 
 
") 
 
 
# # # # # # # #N    BI    	 	 	 	 	bi 	 	 	z z z z zRY z z z|9 9 9 9 9/ 9 9 9zX
 X
 X
 X
 X
ry X
 X
 X
v = = = = = = = =6 ^
 ^
 ^
 ^
 ^
- ^
 ^
 ^
B   =
 =
 =
 =
 =
&> =
 =
 =
@ \
[
[r$   