
     `i>                     j   d Z ddlZddlZddlmZmZ ddlZddlmZ ddl	mc m
Z ddlmZmZmZ ddlmZ ddlmZ ddlmZmZmZmZ dd	lmZ dd
lmZ ddlmZmZmZ ddl m!Z!  ej"        e#          Z$ G d dej%                  Z& G d dej%                  Z'e G d de                      Z( G d dej%                  Z) G d dej%                  Z* G d dej%                  Z+ G d de          Z, G d dej%                  Z- G d dej%                  Z. G d  d!ej%                  Z/e G d" d#e(                      Z0 G d$ d%ej%                  Z1 ed&'           G d( d)e(                      Z2e G d* d+e(                      Z3 ed,'           G d- d.e(                      Z4g d/Z5dS )0zPyTorch LayoutLMv3 model.    N)OptionalUnion)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)GradientCheckpointingLayer)BaseModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)apply_chunking_to_forward)auto_docstringlogging	torch_int   )LayoutLMv3Configc                   *     e Zd ZdZ fdZddZ xZS )LayoutLMv3PatchEmbeddingszLayoutLMv3 image (patch) embeddings. This class also automatically interpolates the position embeddings for varying
    image sizes.c                    t                                                       t          |j        t          j        j                  r|j        n|j        |j        f}t          |j        t          j        j                  r|j        n|j        |j        f}|d         |d         z  |d         |d         z  f| _        t          j
        |j        |j        ||          | _        d S )Nr   r   )kernel_sizestride)super__init__
isinstance
input_sizecollectionsabcIterable
patch_sizepatch_shapennConv2dnum_channelshidden_sizeproj)selfconfig
image_sizer"   	__class__s       /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/layoutlmv3/modeling_layoutlmv3.pyr   z"LayoutLMv3PatchEmbeddings.__init__3   s     &+[_-EFF8F#V%67 	 &+[_-EFF8F#V%67 	
 'qMZ]:JqMZXY]<Z[If163ES]fpqqq			    Nc                    |                      |          }|~|                    d| j        d         | j        d         d          }|                    dddd          }|j        d         |j        d         }}t          j        |||fd          }||z   }|                    d                              dd          }|S )Nr   r   r      bicubic)sizemode)	r(   viewr#   permuteshapeFinterpolateflatten	transpose)r)   pixel_valuesposition_embedding
embeddingspatch_heightpatch_widths         r-   forwardz!LayoutLMv3PatchEmbeddings.forwardC   s    YY|,,
)!3!8!8D<LQ<OQUQabcQdfh!i!i!3!;!;Aq!Q!G!G(2(8(;Z=Ma=P+L!"/AWbHcjs!t!t!t#&88J''**44Q::
r.   N__name__
__module____qualname____doc__r   rA   __classcell__r,   s   @r-   r   r   /   s[         r r r r r        r.   r   c                   F     e Zd ZdZ fdZd Zd Zd Z	 	 	 	 	 ddZ xZ	S )	LayoutLMv3TextEmbeddingszm
    LayoutLMv3 text embeddings. Same as `RobertaEmbeddings` but with added spatial (layout) embeddings.
    c                 ~   t                                                       t          j        |j        |j        |j                  | _        t          j        |j        |j                  | _	        t          j
        |j        |j                  | _
        t          j        |j                  | _        |                     dt!          j        |j                                      d          d           |j        | _        t          j        |j        |j        | j                  | _        t          j        |j        |j                  | _        t          j        |j        |j                  | _        t          j        |j        |j                  | _        t          j        |j        |j                  | _        d S )N)padding_idxepsposition_ids)r   r0   F)
persistent)r   r   r$   	Embedding
vocab_sizer'   pad_token_idword_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_buffertorcharangemax_position_embeddingsexpandrM   position_embeddingsmax_2d_position_embeddingscoordinate_sizex_position_embeddingsy_position_embeddings
shape_sizeh_position_embeddingsw_position_embeddingsr)   r*   r,   s     r-   r   z!LayoutLMv3TextEmbeddings.__init__W   ss   !|F,=v?Q_e_rsss%'\&2H&J\%]%]"f&8f>STTTz&"<== 	EL)GHHOOPWXXej 	 	
 	
 	
 ".#%<*F,>DL\$
 $
 $
  &(\&2SU[Uk%l%l"%'\&2SU[Uk%l%l"%'\&2SU[Uf%g%g"%'\&2SU[Uf%g%g"""r.   c           	         	 |                      |d d d d df                   }|                     |d d d d df                   }|                      |d d d d df                   }|                     |d d d d df                   }n"# t          $ r}t          d          |d }~ww xY w|                     t	          j        |d d d d df         |d d d d df         z
  dd                    }|                     t	          j        |d d d d df         |d d d d df         z
  dd                    }t	          j        ||||||gd          }	|	S )	Nr   r   r1   r   z;The `bbox` coordinate values should be within 0-1000 range.i  r0   dim)re   rf   
IndexErrorrh   r^   clipri   cat)
r)   bboxleft_position_embeddingsupper_position_embeddingsright_position_embeddingslower_position_embeddingserh   ri   spatial_position_embeddingss
             r-   %calculate_spatial_position_embeddingsz>LayoutLMv3TextEmbeddings.calculate_spatial_position_embeddingsn   s   	c'+'A'A$qqq!!!Qw-'P'P$(,(B(B4111a=(Q(Q%(,(B(B4111a=(Q(Q%(,(B(B4111a=(Q(Q%% 	c 	c 	cZ[[abb	c !% : :5:d111aaaQR7mVZ[\[\[\^_^_^_ab[bVcFcefhl;m;m n n $ : :5:d111aaaQR7mVZ[\[\[\^_^_^_ab[bVcFcefhl;m;m n n ',i()))%% 
'
 
'
 
'
# +*s   BB 
B*B%%B*c                     |                     |                                          }t          j        |d                              |          |z  }|                                |z   S )z
        Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding
        symbols are ignored. This is modified from fairseq's `utils.make_positions`.
        r   rl   )neintr^   cumsumtype_aslong)r)   	input_idsrM   maskincremental_indicess        r-   "create_position_ids_from_input_idsz;LayoutLMv3TextEmbeddings.create_position_ids_from_input_ids   s`     ||K((,,..$|Da888@@FF$N"''))K77r.   c                    |                                 dd         }|d         }t          j        | j        dz   || j        z   dz   t          j        |j                  }|                    d                              |          S )z
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.
        Nr0   r   dtypedevicer   )r3   r^   r_   rM   r~   r   	unsqueezera   )r)   inputs_embedsinput_shapesequence_lengthrP   s        r-   &create_position_ids_from_inputs_embedsz?LayoutLMv3TextEmbeddings.create_position_ids_from_inputs_embeds   s     $((**3B3/%a.|q /D4D"Dq"HPUPZcpcw
 
 
 %%a((//<<<r.   Nc                 ~   |K|4|                      || j                                      |j                  }n|                     |          }||                                }n|                                d d         }|+t          j        |t          j        | j	        j                  }|| 
                    |          }|                     |          }||z   }|                     |          }	||	z  }|                     |          }
||
z   }|                     |          }|                     |          }|S )Nr0   r   )r   rM   tor   r   r3   r^   zerosr~   rP   rU   rW   rb   rx   rX   r\   )r)   r   rq   token_type_idsrP   r   r   rW   r>   rb   rw   s              r-   rA   z LayoutLMv3TextEmbeddings.forward   sJ    $#FFyRVRbccff$     $JJ=YY #..**KK',,..ss3K!"[EJtO`OghhhN  00;;M $ : :> J J"%::
"66|DD))
&*&P&PQU&V&V#"==
^^J//
\\*--
r.   )NNNNN)
rD   rE   rF   rG   r   rx   r   r   rA   rH   rI   s   @r-   rK   rK   R   s         h h h h h.+ + +48 8 8
= 
= 
= ' ' ' ' ' ' ' 'r.   rK   c                   $    e Zd ZU eed<   dZd ZdS )LayoutLMv3PreTrainedModelr*   
layoutlmv3c                 `   t          |t          j        t          j        f          rT|j        j                            d| j        j                   |j	         |j	        j        
                                 dS dS t          |t          j                  r_|j        j                            d| j        j                   |j        +|j        j        |j                 
                                 dS dS t          |t          j                  r?|j	        j        
                                 |j        j                            d           dS t          |t                    rJ| j        j        r@|j        j        
                                 |j        j        
                                 dS dS dS )zInitialize the weights        )meanstdNg      ?)r   r$   Linearr%   weightdatanormal_r*   initializer_rangebiaszero_rR   rM   rX   fill_LayoutLMv3Modelvisual_embed	cls_token	pos_embed)r)   modules     r-   _init_weightsz'LayoutLMv3PreTrainedModel._init_weights   s   fry")455 	. M&&CT[5R&SSS{& &&((((( '&-- 
	.M&&CT[5R&SSS!-"6#56<<>>>>> .--- 	.K""$$$M$$S)))))00 	.{' . %++--- %++-----	. 	.. .r.   N)rD   rE   rF   r   __annotations__base_model_prefixr    r.   r-   r   r      s7         $. . . . .r.   r   c                   8     e Zd Z fdZddZ	 	 	 	 	 ddZ xZS )	LayoutLMv3SelfAttentionc                    t                                                       |j        |j        z  dk    r0t	          |d          s t          d|j         d|j         d          |j        | _        t          |j        |j        z            | _        | j        | j        z  | _        t          j
        |j        | j                  | _        t          j
        |j        | j                  | _        t          j
        |j        | j                  | _        t          j        |j                  | _        |j        | _        |j        | _        d S )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ())r   r   r'   num_attention_headshasattr
ValueErrorr{   attention_head_sizeall_head_sizer$   r   querykeyvaluerZ   attention_probs_dropout_probr\   has_relative_attention_biashas_spatial_attention_biasrj   s     r-   r   z LayoutLMv3SelfAttention.__init__   s1    ::a??PVXhHiHi?8F$6 8 8 48 8 8  
 $*#= #&v'9F<V'V#W#W !58PPYv143EFF
9V/1CDDYv143EFF
z&"EFF+1+M(*0*K'''r.       c                     ||z  }|                     d                              d          }||z
  |z  } t          j        d          |          S )a  
        https://huggingface.co/papers/2105.13290 Section 2.4 Stabilization of training: Precision Bottleneck Relaxation
        (PB-Relax). A replacement of the original nn.Softmax(dim=-1)(attention_scores). Seems the new attention_probs
        will result in a slower speed and a little bias. Can use torch.allclose(standard_attention_probs,
        cogview_attention_probs, atol=1e-08) for comparison. The smaller atol (e.g., 1e-08), the better.
        r0   rl   )amaxr   r$   Softmax)r)   attention_scoresalphascaled_attention_scores	max_valuenew_attention_scoress         r-   cogview_attentionz)LayoutLMv3SelfAttention.cogview_attention   sa     #3U":+00b0::DDRHH	 7) CuL!rzb!!!"6777r.   NFc                 d   |j         \  }}}	|                     |                              |d| j        | j                                      dd          }
|                     |                              |d| j        | j                                      dd          }|                     |                              |d| j        | j                                      dd          }t          j	        |
t          j        | j                  z  |                    dd                    }| j        r*| j        r#|||z   t          j        | j                  z  z  }n&| j        r||t          j        | j                  z  z  }|||z   }|                     |          }|                     |          }|||z  }t          j	        ||          }|                    dddd                                          }|                                d d         | j        fz   } |j        | }|r||fn|f}|S )Nr0   r   r1   r   r   )r7   r   r5   r   r   r;   r   r   r^   matmulmathsqrtr   r   r   r\   r6   
contiguousr3   r   )r)   hidden_statesattention_mask	head_maskoutput_attentionsrel_pos
rel_2d_pos
batch_size
seq_length_query_layer	key_layervalue_layerr   attention_probscontext_layernew_context_layer_shapeoutputss                     r-   rA   zLayoutLMv3SelfAttention.forward  s7    %2$7!
JJJ}%%T*b$":D<TUUYq!__ 	 HH]##T*b$":D<TUUYq!__ 	 JJ}%%T*b$":D<TUUYq!__ 	 !<di@X6Y6Y(Y[d[n[noqsu[v[vww+ 	N0O 	N:!54C[9\9\ \\- 	N$)D4L*M*M MM%/.@ 001ABB ,,77  -	9O_kBB%--aAq99DDFF"/"4"4"6"6ss";t?Q>S"S**,CD6G]=/22mM]r.   )r   NNFNN)rD   rE   rF   r   r   rA   rH   rI   s   @r-   r   r      sv        L L L L L(
8 
8 
8 
8 < < < < < < < <r.   r   c                   P     e Zd Z fdZdej        dej        dej        fdZ xZS )LayoutLMv3SelfOutputc                    t                                                       t          j        |j        |j                  | _        t          j        |j        |j                  | _        t          j        |j	                  | _
        d S NrN   )r   r   r$   r   r'   denserX   rY   rZ   r[   r\   rj   s     r-   r   zLayoutLMv3SelfOutput.__init__D  sf    Yv163EFF
f&8f>STTTz&"<==r.   r   input_tensorreturnc                     |                      |          }|                     |          }|                     ||z             }|S rB   r   r\   rX   r)   r   r   s      r-   rA   zLayoutLMv3SelfOutput.forwardJ  @    

=11]33}|'CDDr.   rD   rE   rF   r   r^   TensorrA   rH   rI   s   @r-   r   r   C  i        > > > > >U\  RWR^        r.   r   c                   0     e Zd Z fdZ	 	 	 	 	 ddZ xZS )LayoutLMv3Attentionc                     t                                                       t          |          | _        t	          |          | _        d S rB   )r   r   r   r)   r   outputrj   s     r-   r   zLayoutLMv3Attention.__init__S  s;    +F33	*622r.   NFc                     |                      ||||||          }|                     |d         |          }|f|dd          z   }	|	S )Nr   r   r   r   )r)   r   )
r)   r   r   r   r   r   r   self_outputsattention_outputr   s
             r-   rA   zLayoutLMv3Attention.forwardX  sc     yy! ! 
 
  ;;|AFF#%QRR(88r.   r   )rD   rE   rF   r   rA   rH   rI   s   @r-   r   r   R  s]        3 3 3 3 3        r.   r   c                   6     e Zd Z fdZ	 	 	 	 	 ddZd Z xZS )LayoutLMv3Layerc                     t                                                       |j        | _        d| _        t	          |          | _        t          |          | _        t          |          | _	        d S Nr   )
r   r   chunk_size_feed_forwardseq_len_dimr   	attentionLayoutLMv3IntermediateintermediateLayoutLMv3Outputr   rj   s     r-   r   zLayoutLMv3Layer.__init__p  s^    '-'E$,V4426::&v..r.   NFc                     |                      ||||||          }|d         }|dd          }	t          | j        | j        | j        |          }
|
f|	z   }	|	S )N)r   r   r   r   r   )r   r   feed_forward_chunkr   r   )r)   r   r   r   r   r   r   self_attention_outputsr   r   layer_outputs              r-   rA   zLayoutLMv3Layer.forwardx  s     "&/! "0 "
 "
 2!4(,0#T%A4CSUe
 
  /G+r.   c                 \    |                      |          }|                     ||          }|S rB   )r   r   )r)   r   intermediate_outputr   s       r-   r   z"LayoutLMv3Layer.feed_forward_chunk  s2    "//0@AA{{#68HIIr.   r   )rD   rE   rF   r   rA   r   rH   rI   s   @r-   r   r   o  sl        / / / / /    8      r.   r   c                   L     e Zd Z fdZddZd Zd Z	 	 	 	 	 	 	 	 	 dd
Z xZS )LayoutLMv3Encoderc                 h   t                                                       | _        t          j        fdt          j                  D                       | _        d| _        j	        | _	        j
        | _
        | j	        r>j        | _        j        | _        t          j        | j        j        d          | _        | j
        rfj        | _        j        | _        t          j        | j        j        d          | _        t          j        | j        j        d          | _        d S d S )Nc                 .    g | ]}t                    S r   )r   ).0r   r*   s     r-   
<listcomp>z.LayoutLMv3Encoder.__init__.<locals>.<listcomp>  s!    #e#e#eOF$;$;#e#e#er.   F)r   )r   r   r*   r$   
ModuleListrangenum_hidden_layerslayergradient_checkpointingr   r   rel_pos_binsmax_rel_posr   r   rel_pos_biasmax_rel_2d_posrel_2d_pos_binsrel_pos_x_biasrel_pos_y_biasrj   s    `r-   r   zLayoutLMv3Encoder.__init__  s&   ]#e#e#e#eU6KcEdEd#e#e#eff
&+#+1+M(*0*K'+ 	e & 3D%1D "	$*;V=W^c d d dD* 	j"("7D#)#9D "$)D,@&B\ch"i"i"iD"$)D,@&B\ch"i"i"iD		j 	jr.   Tr      c                 :   d}|r8|dz  }||dk                                     |z  z  }t          j        |          }n(t          j        | t          j        |                    }|dz  }||k     }|t          j        |                                |z            t          j        ||z            z  ||z
  z                      t          j                   z   }	t          j	        |	t          j
        |	|dz
                      }	|t          j        |||	          z  }|S )Nr   r1   r   )r~   r^   absmax
zeros_likelogfloatr   r   min	full_likewhere)
r)   relative_positionbidirectionalnum_bucketsmax_distanceretn	max_exactis_smallval_if_larges
             r-   relative_position_bucketz*LayoutLMv3Encoder.relative_position_bucket  s    	SAK%)//11K??C	+,,AA	,,e.>?P.Q.QRRA  1$	y= !Iaggii)+,,txy8P/Q/QQU`clUlm
"UZ.. yu|[[\_/]/]^^u{8Q555
r.   c                    |                     d          |                     d          z
  }|                     || j        | j                  }t	          j                    5  | j        j                                        |         	                    dddd          }d d d            n# 1 swxY w Y   |
                                }|S )Nr   r0   r  r  r   r   r   r1   )r   r$  r
  r  r^   no_gradr  r   tr6   r   )r)   rP   rel_pos_matr   s       r-   _cal_1d_pos_embz!LayoutLMv3Encoder._cal_1d_pos_emb  s   ",,R00<3I3I"3M3MM//)) 0 
 
 ]__ 	P 	P'.00227;CCAq!QOOG	P 	P 	P 	P 	P 	P 	P 	P 	P 	P 	P 	P 	P 	P 	P$$&&s   !;B((B,/B,c                    |d d d d df         }|d d d d df         }|                     d          |                     d          z
  }|                     d          |                     d          z
  }|                     || j        | j                  }|                     || j        | j                  }t	          j                    5  | j        j                                        |         	                    dddd          }| j
        j                                        |         	                    dddd          }d d d            n# 1 swxY w Y   |                                }|                                }||z   }|S )Nr   r   r   r0   r&  r   r1   )r   r$  r  r  r^   r'  r  r   r(  r6   r  r   )	r)   rq   position_coord_xposition_coord_yrel_pos_x_2d_matrel_pos_y_2d_mat	rel_pos_x	rel_pos_yr   s	            r-   _cal_2d_pos_embz!LayoutLMv3Encoder._cal_2d_pos_emb  s   111a=111a=+55b99<L<V<VWY<Z<ZZ+55b99<L<V<VWY<Z<ZZ11,, 2 
 
	
 11,, 2 
 
	 ]__ 	V 	V+24466yAII!QPQSTUUI+24466yAII!QPQSTUUI	V 	V 	V 	V 	V 	V 	V 	V 	V 	V 	V 	V 	V 	V 	V ((**	((**	*
s   A5EEENFc           	         |rdnd }|rdnd }| j         r|                     |          nd }| j        r|                     |          nd }t	          | j                  D ]@\  }}|r||fz   }|||         nd } |||||||          }|d         }|r||d         fz   }A|r||fz   }|st          d |||fD                       S t          |||          S )Nr   r   r   r   c              3      K   | ]}||V  	d S rB   r   )r  vs     r-   	<genexpr>z,LayoutLMv3Encoder.forward.<locals>.<genexpr>  s4        
 =  !=== r.   last_hidden_stater   
attentions)r   r*  r   r2  	enumerater  tupler   )r)   r   rq   r   r   r   output_hidden_statesreturn_dictrP   r?   r@   all_hidden_statesall_self_attentionsr   r   ilayer_modulelayer_head_masklayer_outputss                      r-   rA   zLayoutLMv3Encoder.forward  sy    #7@BBD$5?bb48<8Xb$&&|444^b373R\T))$///X\
(44 	P 	POA|# I$58H$H!.7.CillO(L!%  M *!,M  P&9]1=M<O&O# 	E 1]4D D 		   "%'      ++*
 
 
 	
r.   )Tr   r  )	NNNFFTNNN)	rD   rE   rF   r   r$  r*  r2  rA   rH   rI   s   @r-   r   r     s        j j j j j(   .  "  < "7
 7
 7
 7
 7
 7
 7
 7
r.   r   c                   B     e Zd Z fdZdej        dej        fdZ xZS )r   c                    t                                                       t          j        |j        |j                  | _        t          |j        t                    rt          |j                 | _        d S |j        | _        d S rB   )r   r   r$   r   r'   intermediate_sizer   r   
hidden_actstrr	   intermediate_act_fnrj   s     r-   r   zLayoutLMv3Intermediate.__init__.  sn    Yv163KLL
f'-- 	9'-f.?'@D$$$'-'8D$$$r.   r   r   c                 Z    |                      |          }|                     |          }|S rB   )r   rI  )r)   r   s     r-   rA   zLayoutLMv3Intermediate.forward6  s,    

=1100??r.   r   rI   s   @r-   r   r   -  s^        9 9 9 9 9U\ el        r.   r   c                   P     e Zd Z fdZdej        dej        dej        fdZ xZS )r   c                    t                                                       t          j        |j        |j                  | _        t          j        |j        |j                  | _        t          j	        |j
                  | _        d S r   )r   r   r$   r   rF  r'   r   rX   rY   rZ   r[   r\   rj   s     r-   r   zLayoutLMv3Output.__init__>  sf    Yv79KLL
f&8f>STTTz&"<==r.   r   r   r   c                     |                      |          }|                     |          }|                     ||z             }|S rB   r   r   s      r-   rA   zLayoutLMv3Output.forwardD  r   r.   r   rI   s   @r-   r   r   =  r   r.   r   c                       e Zd Z fdZd Zd Zd ZddZd Zd	 Z	e
	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej                 deej                 deej                 deej                 dee         dee         dee         deeef         fd            Z xZS )r   c                 |   t                                          |           || _        |j        rt	          |          | _        |j        rGt          |          | _        t          |j
        |j        z            }t          j        t          j        dd|j                            | _        t          j        t          j        d||z  dz   |j                            | _        t          j        d          | _        t          j        |j        |j                  | _        t          j        |j                  | _        | j        j        s| j        j        r|                     ||f           t          j        |j        d          | _        t;          |          | _        |                                  d S )Nr   r   )prN   )r+   gư>) r   r   r*   
text_embedrK   r>   r   r   patch_embedr{   r   r"   r$   	Parameterr^   r   r'   r   r   rZ   pos_droprX   rY   r[   r\   r   r   init_visual_bboxnormr   encoderinit_weights)r)   r*   r3   r,   s      r-   r   zLayoutLMv3Model.__init__M  sv       	?6v>>DO 	C  9@@Dv(6+<<==D\%+aF<N*O*OPPDN\%+aq&J\*]*]^^DNJ---DM\&*<&BWXXXDN:f&@AADL{6 ?$+:` ?%%$%>>>V%7TBBBDI(00r.   c                     | j         j        S rB   r>   rU   )r)   s    r-   get_input_embeddingsz$LayoutLMv3Model.get_input_embeddingsj  s    ..r.   c                     || j         _        d S rB   rZ  )r)   r   s     r-   set_input_embeddingsz$LayoutLMv3Model.set_input_embeddingsm  s    */'''r.   c                     |                                 D ]/\  }}| j        j        |         j                            |           0dS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsrW  r  r   prune_heads)r)   heads_to_pruner  headss       r-   _prune_headszLayoutLMv3Model._prune_headsp  sU    
 +0022 	C 	CLE5Lu%/;;EBBBB	C 	Cr.      re    c           	      .   t          j        t          j        d||d         dz   z  |          |d         d          }t          j        t          j        d||d         dz   z  |          |d         d          }t          j        |dd                             |d         d          |dd                             |d         d                              dd          |dd                             |d         d          |dd                             |d         d                              dd          gd                              dd          }t          j        dd|dz
  |dz
  gg          }t          j        ||gd          | _	        dS )	zJ
        Create the bounding boxes for the visual (patch) tokens.
        r   r   trunc)rounding_modeNr0   rl      )
r^   divr_   stackrepeatr;   r5   tensorrp   visual_bbox)r)   r+   max_lenvisual_bbox_xvisual_bbox_yro  cls_token_boxs          r-   rU  z LayoutLMv3Model.init_visual_bboxx  s    	LGz!}q'897CCZPQ]bi
 
 
 	LGz!}q'897CCZPQ]bi
 
 
 kcrc"))*Q-;;crc"))*Q-;;EEaKKabb!((A::abb!((A::DDQJJ	 
 
 
 $r1++ 	 ueWq['A+&N%OPP 9m[%AqIIIr.   c                     | j                             |dd          }|                    |                              |          }|S r   )ro  rm  r   type)r)   r   r   r   ro  s        r-   calculate_visual_bboxz%LayoutLMv3Model.calculate_visual_bbox  sA    &--j!Q??!nnV,,11%88r.   c                 >   |                      |          }|                                \  }}}| j                            |dd          }t	          j        ||fd          }| j        
|| j        z   }|                     |          }|                     |          }|S )Nr0   r   rl   )	rR  r3   r   ra   r^   rp   r   rT  rV  )r)   r<   r>   r   seq_lenr   
cls_tokenss          r-   forward_imagezLayoutLMv3Model.forward_image  s    %%l33
 ",!2!2
GQ^**:r2>>
Y
J7Q???
 >%#dn4J]]:..
YYz**
r.   Nr   rq   r   r   rP   r   r   r<   r   r<  r=  r   c                    |	|	n| j         j        }	|
|
n| j         j        }
||n| j         j        }|!|                                }|\  }}|j        }nS|)|                                dd         }|\  }}|j        }n(|t          |          }|j        }nt          d          |||t          j	        ||f|          }|!t          j
        |t          j        |          }|?t          j
        t          t          |          dgz             t          j        |          }|                     |||||          }dx}}dx}}| t          |j        d         | j         j        z            t          |j        d	         | j         j        z            }}|                     |          }t          j	        ||j        d
         ft          j        |          }|t          j        ||gd
          }n|}| j         j        s| j         j        r| j         j        r?|                     |t          j        |          }|t          j        ||gd
          }n|}t          j        d|j        d
         t          j        |                              |d
          }||^t          j        d|d
         |                              d          }|                    |          }t          j        ||gd
          }n|}||t          j        ||gd
          }n|}|                     |          }|                     |          }ng| j         j        s| j         j        rO| j         j        r|}| j         j        r5| j        j        ddd|d
         f         }|                    |          }|}|                     |d||j                   }| !                    || j         j"                  }| #                    ||||||	|
|||
  
        }|d         }|s|f|d
d         z   S tI          ||j%        |j&                  S )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, token_sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Note that `sequence_length = token_sequence_length + patch_sequence_length + 1` where `1` is for [CLS]
            token. See `pixel_values` for `patch_sequence_length`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        bbox (`torch.LongTensor` of shape `(batch_size, token_sequence_length, 4)`, *optional*):
            Bounding boxes of each input sequence tokens. Selected in the range `[0,
            config.max_2d_position_embeddings-1]`. Each bounding box should be a normalized version in (x0, y0, x1, y1)
            format, where (x0, y0) corresponds to the position of the upper left corner in the bounding box, and (x1,
            y1) represents the position of the lower right corner.

            Note that `sequence_length = token_sequence_length + patch_sequence_length + 1` where `1` is for [CLS]
            token. See `pixel_values` for `patch_sequence_length`.
        token_type_ids (`torch.LongTensor` of shape `(batch_size, token_sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            Note that `sequence_length = token_sequence_length + patch_sequence_length + 1` where `1` is for [CLS]
            token. See `pixel_values` for `patch_sequence_length`.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `(batch_size, token_sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            Note that `sequence_length = token_sequence_length + patch_sequence_length + 1` where `1` is for [CLS]
            token. See `pixel_values` for `patch_sequence_length`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, token_sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.

        Examples:

        ```python
        >>> from transformers import AutoProcessor, AutoModel
        >>> from datasets import load_dataset

        >>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
        >>> model = AutoModel.from_pretrained("microsoft/layoutlmv3-base")

        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
        >>> example = dataset[0]
        >>> image = example["image"]
        >>> words = example["tokens"]
        >>> boxes = example["bboxes"]

        >>> encoding = processor(image, words, boxes=boxes, return_tensors="pt")

        >>> outputs = model(**encoding)
        >>> last_hidden_states = outputs.last_hidden_state
        ```Nr0   zEYou have to specify either input_ids or inputs_embeds or pixel_values)r   r   rj  )r   rq   rP   r   r   r1   r   r   rl   )r   r   r   )r   )	rq   rP   r   r   r   r<  r=  r?   r@   r7  )'r*   r   r<  use_return_dictr3   r   lenr   r^   onesr   r~   r;  listr>   r   r7   r"   rz  rp   r   r   rv  r_   rm  r   ra   rX   r\   rP   	expand_asget_extended_attention_maskr   get_head_maskr  rW  r   r   r9  )r)   r   rq   r   r   rP   r   r   r<   r   r<  r=  r   r   r   r   embedding_output
final_bboxfinal_position_idsr?   r@   visual_embeddingsvisual_attention_maskro  visual_position_idsextended_attention_maskencoder_outputssequence_outputs                               r-   rA   zLayoutLMv3Model.forward  s   \ 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B] #..**K%0"J
%FF&',,..ss3K%0"J
")FF%\**J!(FFdeee M$=%!&j*-Ev!V!V!V%!&[
SY!Z!Z!Z|{5k):):aS)@#A#A\bccc##)-+  /     +/.
'%)){#,,Q/$+2HHII,,Q/$+2HHII &L !% 2 2< @ @$)J.4Q78
SY% % %! )!&N<Q+RXY!Z!Z!Z!6{6 =$+:` =;9 1"&"<"<V5:bl"<"m"mK'%*Yk/B%J%J%J

%0
&+l(.q1F' ' '&Q'' $ (M,E#(<;q>&#Q#Q#Q#[#[\]#^#^L#/#6#6{#C#CL).LBU3V\])^)^)^&&)<&$(A#(9.>@Q-RXY#Z#Z#Z  #4 #~~.>??#||,<==[4 	28^ 	2{5 "!
{6 2#;AAA?OQ?O<OP+55i@@%1"040P0PD&0@0F 1Q 1
 1
 &&y$+2OPP	,,+2/!5#%# ' 
 
 *!, 	<#%(;;;-)7&1
 
 
 	
r.   )rd  rf  )NNNNNNNNNNN)rD   rE   rF   r   r[  r]  rc  rU  rv  rz  r   r   r^   
LongTensorFloatTensorboolr   r;  r   rA   rH   rI   s   @r-   r   r   K  s           :/ / /0 0 0C C CJ J J J.  
  "  15+/6:5937155948,0/3&*C
 C
E,-C
 u'(C
 !!23	C

 !!12C
 u/0C
 E-.C
   12C
 u01C
 $D>C
 'tnC
 d^C
 
uo%	&C
 C
 C
 ^C
 C
 C
 C
 C
r.   r   c                   *     e Zd ZdZd fd	Zd Z xZS )LayoutLMv3ClassificationHeadz\
    Head for sentence-level classification tasks. Reference: RobertaClassificationHead
    Fc                    t                                                       || _        |r(t          j        |j        dz  |j                  | _        n$t          j        |j        |j                  | _        |j        |j        n|j        }t          j	        |          | _
        t          j        |j        |j                  | _        d S )Nr   )r   r   pool_featurer$   r   r'   r   classifier_dropoutr[   rZ   r\   
num_labelsout_proj)r)   r*   r  r  r,   s       r-   r   z%LayoutLMv3ClassificationHead.__init__q  s    ( 	K6#5#96;MNNDJJ6#5v7IJJDJ)/)B)NF%%TZTn 	 z"455	&"4f6GHHr.   c                     |                      |          }|                     |          }t          j        |          }|                      |          }|                     |          }|S rB   )r\   r   r^   tanhr  )r)   xs     r-   rA   z$LayoutLMv3ClassificationHead.forward~  sR    LLOOJJqMMJqMMLLOOMM!r.   )FrC   rI   s   @r-   r  r  l  s\         I I I I I I      r.   r  a  
    LayoutLMv3 Model with a token classification head on top (a linear layer on top of the final hidden states) e.g.
    for sequence labeling (information extraction) tasks such as [FUNSD](https://guillaumejaume.github.io/FUNSD/),
    [SROIE](https://rrc.cvc.uab.es/?ch=13), [CORD](https://github.com/clovaai/cord) and
    [Kleister-NDA](https://github.com/applicaai/kleister-nda).
    )custom_introc                   ~    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej                 deej                 d	eej                 d
eej                 dee	         dee	         dee	         deej                 de
eef         fd            Z xZS ) LayoutLMv3ForTokenClassificationc                 z   t                                          |           |j        | _        t          |          | _        t          j        |j                  | _        |j        dk     r%t          j	        |j
        |j                  | _        nt          |d          | _        |                                  d S )N
   Fr  )r   r   r  r   r   r$   rZ   r[   r\   r   r'   
classifierr  rX  rj   s     r-   r   z)LayoutLMv3ForTokenClassification.__init__  s        +)&11z&"<==r!! i(:F<MNNDOO:6PUVVVDOr.   Nr   rq   r   r   rP   r   r   labelsr   r<  r=  r<   r   c                 V   ||n| j         j        }|                     ||||||||	|
||          }||                                }n|                                dd         }|d         }|d         ddd|f         }|                     |          }|                     |          }d}|Ft                      } ||                    d| j                  |                    d                    }|s|f|dd         z   }||f|z   n|S t          |||j
        |j                  S )a!  
        bbox (`torch.LongTensor` of shape `(batch_size, sequence_length, 4)`, *optional*):
            Bounding boxes of each input sequence tokens. Selected in the range `[0,
            config.max_2d_position_embeddings-1]`. Each bounding box should be a normalized version in (x0, y0, x1, y1)
            format, where (x0, y0) corresponds to the position of the upper left corner in the bounding box, and (x1,
            y1) represents the position of the lower right corner.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.

        Examples:

        ```python
        >>> from transformers import AutoProcessor, AutoModelForTokenClassification
        >>> from datasets import load_dataset

        >>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
        >>> model = AutoModelForTokenClassification.from_pretrained("microsoft/layoutlmv3-base", num_labels=7)

        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
        >>> example = dataset[0]
        >>> image = example["image"]
        >>> words = example["tokens"]
        >>> boxes = example["bboxes"]
        >>> word_labels = example["ner_tags"]

        >>> encoding = processor(image, words, boxes=boxes, word_labels=word_labels, return_tensors="pt")

        >>> outputs = model(**encoding)
        >>> loss = outputs.loss
        >>> logits = outputs.logits
        ```N)
rq   r   r   rP   r   r   r   r<  r=  r<   r0   r   r   losslogitsr   r9  )r*   r|  r   r3   r\   r  r   r5   r  r   r   r9  )r)   r   rq   r   r   rP   r   r   r  r   r<  r=  r<   r   r   r   r  r  r  loss_fctr   s                        r-   rA   z(LayoutLMv3ForTokenClassification.forward  sg   ^ &1%<kk$+B]//))%'/!5#% " 
 
  #..**KK',,..ss3K ^
!!*QQQ^4,,7711'))H8FKKDO<<fkk"ooNND 	FY,F)-)9TGf$$vE$!/)	
 
 
 	
r.   NNNNNNNNNNNN)rD   rE   rF   r   r   r   r^   r  r  r  r   r;  r   rA   rH   rI   s   @r-   r  r    sx             15+/6:59371559-1,0/3&*37V
 V
E,-V
 u'(V
 !!23	V

 !!12V
 u/0V
 E-.V
   12V
 )*V
 $D>V
 'tnV
 d^V
 u/0V
 
u++	,V
 V
 V
 ^V
 V
 V
 V
 V
r.   r  c                        e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej                 deej                 d	eej                 d
eej                 dee	         dee	         dee	         deej                 deej                 de
eef         fd            Z xZS )LayoutLMv3ForQuestionAnsweringc                     t                                          |           |j        | _        t          |          | _        t          |d          | _        |                                  d S NFr  )r   r   r  r   r   r  
qa_outputsrX  rj   s     r-   r   z'LayoutLMv3ForQuestionAnswering.__init__  s`        +)&116vERRRr.   Nr   r   r   rP   r   r   start_positionsend_positionsr   r<  r=  rq   r<   r   c                    ||n| j         j        }|                     |||||||	|
|||          }|d         }|                     |          }|                    dd          \  }}|                    d                                          }|                    d                                          }d}||t          |                                          dk    r|                    d          }t          |                                          dk    r|                    d          }|                    d          }|	                    d|          }|	                    d|          }t          |          } |||          } |||          }||z   dz  }|s||f|dd         z   }||f|z   n|S t          ||||j        |j        	          S )
a  
        bbox (`torch.LongTensor` of shape `(batch_size, sequence_length, 4)`, *optional*):
            Bounding boxes of each input sequence tokens. Selected in the range `[0,
            config.max_2d_position_embeddings-1]`. Each bounding box should be a normalized version in (x0, y0, x1, y1)
            format, where (x0, y0) corresponds to the position of the upper left corner in the bounding box, and (x1,
            y1) represents the position of the lower right corner.

        Examples:

        ```python
        >>> from transformers import AutoProcessor, AutoModelForQuestionAnswering
        >>> from datasets import load_dataset
        >>> import torch

        >>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
        >>> model = AutoModelForQuestionAnswering.from_pretrained("microsoft/layoutlmv3-base")

        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
        >>> example = dataset[0]
        >>> image = example["image"]
        >>> question = "what's his name?"
        >>> words = example["tokens"]
        >>> boxes = example["bboxes"]

        >>> encoding = processor(image, question, words, boxes=boxes, return_tensors="pt")
        >>> start_positions = torch.tensor([1])
        >>> end_positions = torch.tensor([3])

        >>> outputs = model(**encoding, start_positions=start_positions, end_positions=end_positions)
        >>> loss = outputs.loss
        >>> start_scores = outputs.start_logits
        >>> end_scores = outputs.end_logits
        ```N
r   r   rP   r   r   r   r<  r=  rq   r<   r   r   r0   rl   )ignore_indexr1   )r  start_logits
end_logitsr   r9  )r*   r|  r   r  splitsqueezer   r}  r3   clampr   r   r   r9  )r)   r   r   r   rP   r   r   r  r  r   r<  r=  rq   r<   r   r  r  r  r  
total_lossignored_indexr  
start_lossend_lossr   s                            r-   rA   z&LayoutLMv3ForQuestionAnswering.forward  s   f &1%<kk$+B]//))%'/!5#% " 
 
 "!*11#)<<r<#:#: j#++B//::<<''++6688

&=+D?''))**Q.."1"9"9""="==%%''((1,, - 5 5b 9 9(--a00M-33A}EEO)//=AAM']CCCH!,@@Jx
M::H$x/14J 	R"J/'!""+=F/9/EZMF**6Q+%!!/)
 
 
 	
r.   )NNNNNNNNNNNNN)rD   rE   rF   r   r   r   r^   r  r  r  r   r;  r   rA   rH   rI   s   @r-   r  r    s             156:593715596:48,0/3&*+/37d
 d
E,-d
 !!23d
 !!12	d

 u/0d
 E-.d
   12d
 "%"23d
   01d
 $D>d
 'tnd
 d^d
 u'(d
 u/0d
 
u22	3d
 d
 d
 ^d
 d
 d
 d
 d
r.   r  a
  
    LayoutLMv3 Model with a sequence classification head on top (a linear layer on top of the final hidden state of the
    [CLS] token) e.g. for document image classification tasks such as the
    [RVL-CDIP](https://www.cs.cmu.edu/~aharley/rvl-cdip/) dataset.
    c                   ~    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej                 deej                 d	eej                 d
ee	         dee	         dee	         deej                 deej                 de
eef         fd            Z xZS )#LayoutLMv3ForSequenceClassificationc                     t                                          |           |j        | _        || _        t	          |          | _        t          |d          | _        |                                  d S r  )	r   r   r  r*   r   r   r  r  rX  rj   s     r-   r   z,LayoutLMv3ForSequenceClassification.__init__r  sg        +)&116vERRRr.   Nr   r   r   rP   r   r   r  r   r<  r=  rq   r<   r   c                    |
|
n| j         j        }
|                     ||||||||	|
||          }|d         dddddf         }|                     |          }d}|Z| j         j        f| j        dk    rd| j         _        nN| j        dk    r7|j        t          j        k    s|j        t          j	        k    rd| j         _        nd| j         _        | j         j        dk    rWt                      }| j        dk    r1 ||                                |                                          }n |||          }n| j         j        dk    rGt                      } ||                    d| j                  |                    d                    }n*| j         j        dk    rt                      } |||          }|
s|f|dd         z   }||f|z   n|S t          |||j        |j        	          S )
a_  
        bbox (`torch.LongTensor` of shape `(batch_size, sequence_length, 4)`, *optional*):
            Bounding boxes of each input sequence tokens. Selected in the range `[0,
            config.max_2d_position_embeddings-1]`. Each bounding box should be a normalized version in (x0, y0, x1, y1)
            format, where (x0, y0) corresponds to the position of the upper left corner in the bounding box, and (x1,
            y1) represents the position of the lower right corner.

        Examples:

        ```python
        >>> from transformers import AutoProcessor, AutoModelForSequenceClassification
        >>> from datasets import load_dataset
        >>> import torch

        >>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
        >>> model = AutoModelForSequenceClassification.from_pretrained("microsoft/layoutlmv3-base")

        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
        >>> example = dataset[0]
        >>> image = example["image"]
        >>> words = example["tokens"]
        >>> boxes = example["bboxes"]

        >>> encoding = processor(image, words, boxes=boxes, return_tensors="pt")
        >>> sequence_label = torch.tensor([1])

        >>> outputs = model(**encoding, labels=sequence_label)
        >>> loss = outputs.loss
        >>> logits = outputs.logits
        ```Nr  r   r   
regressionsingle_label_classificationmulti_label_classificationr0   r  )r*   r|  r   r  problem_typer  r   r^   r~   r{   r   r  r   r5   r   r   r   r9  )r)   r   r   r   rP   r   r   r  r   r<  r=  rq   r<   r   r  r  r  r  r   s                      r-   rA   z+LayoutLMv3ForSequenceClassification.forward{  s%   \ &1%<kk$+B]//))%'/!5#% " 
 
 "!*QQQ111W-11{'/?a''/;DK,,_q((flej.H.HFL\a\eLeLe/LDK,,/KDK,{'<77"99?a''#8FNN$4$4fnn6F6FGGDD#8FF33DD)-JJJ+--xB @ @&++b//RR)-III,..x// 	FY,F)-)9TGf$$vE'!/)	
 
 
 	
r.   r  )rD   rE   rF   r   r   r   r^   r  r  r  r   r;  r   rA   rH   rI   s   @r-   r  r  j  sx             156:59371559-1,0/3&*+/37`
 `
E,-`
 !!23`
 !!12	`

 u/0`
 E-.`
   12`
 )*`
 $D>`
 'tn`
 d^`
 u'(`
 u/0`
 
u..	/`
 `
 `
 ^`
 `
 `
 `
 `
r.   r  )r  r  r  r   r   )6rG   r   r   typingr   r   r^   torch.nnr$   torch.nn.functional
functionalr8   r   r   r   activationsr	   modeling_layersr
   modeling_outputsr   r   r   r   modeling_utilsr   pytorch_utilsr   utilsr   r   r   configuration_layoutlmv3r   
get_loggerrD   loggerModuler   rK   r   r   r   r   r   r   r   r   r   r  r  r  r  __all__r   r.   r-   <module>r     s           " " " " " " " "                 A A A A A A A A A A ! ! ! ! ! ! 9 9 9 9 9 9            . - - - - - 6 6 6 6 6 6         
 7 6 6 6 6 6 
	H	%	%         	      Fs s s s sry s s sl . . . . . . . .2] ] ] ] ]bi ] ] ]B    29       ")   :( ( ( ( (0 ( ( (VO
 O
 O
 O
 O
	 O
 O
 O
f    RY        ry    ]
 ]
 ]
 ]
 ]
/ ]
 ]
 ]
@	    29   6   e
 e
 e
 e
 e
'@ e
 e
 e
P o
 o
 o
 o
 o
%> o
 o
 o
d   k
 k
 k
 k
 k
*C k
 k
 k
\  r.   