
     `i                        d Z ddlZddlmZ ddlmZmZ ddlZddlmZ ddl	m
Z
mZmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZmZmZmZ ddlmZ ddlmZmZmZ ddl m!Z!m"Z"m#Z#m$Z$m%Z% ddl&m'Z'  e%j(        e)          Z*da+d Z,d Z-dJdZ.dJdZ/dJdZ0d Z1 G d dej2        j3                  Z4 G d dej2        j3                  Z5 G d d          Z6dKdZ7d Z8	 	 	 dLd Z9 G d! d"ej:                  Z; G d# d$ej:                  Z< G d% d&ej:                  Z= G d' d(ej:                  Z> G d) d*ej:                  Z? G d+ d,ej:                  Z@ G d- d.e          ZA G d/ d0ej:                  ZB G d1 d2ej:                  ZC G d3 d4ej:                  ZD G d5 d6ej:                  ZEe! G d7 d8e                      ZFe! G d9 d:eF                      ZGe! G d; d<eF                      ZH G d= d>ej:                  ZI e!d?@           G dA dBeF                      ZJe! G dC dDeF                      ZKe! G dE dFeF                      ZLe! G dG dHeF                      ZMg dIZNdS )MzPyTorch MRA model.    N)Path)OptionalUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss)load   )ACT2FN)GradientCheckpointingLayer)"BaseModelOutputWithCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)auto_docstringis_cuda_platformis_ninja_availableis_torch_cuda_availablelogging   )	MraConfigc                      t          t                                                    j        j        j        dz  dz  fd}  | g d          }t	          d|d          ad S )Nkernelsmrac                      fd| D             S )Nc                     g | ]}|z  S  r$   ).0file
src_folders     x/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/mra/modeling_mra.py
<listcomp>z:load_cuda_kernels.<locals>.append_root.<locals>.<listcomp>4   s    444d
T!444    r$   )filesr'   s    r(   append_rootz&load_cuda_kernels.<locals>.append_root3   s    4444e4444r*   )zcuda_kernel.cuzcuda_launch.cuztorch_extension.cppcuda_kernelT)verbose)r   __file__resolveparentr
   mra_cuda_kernel)r,   	src_filesr'   s     @r(   load_cuda_kernelsr4   /   sv    h''))07>JURJ5 5 5 5 5 WWWXXI=)TBBBOOOr*   c                    t          |                                           dk    rt          d          t          |                                          dk    rt          d          |                     d          dk    rt          d          |                     d          dk    rt          d          |                     d	
          j                            dd	          }|                                }|                                }|                                }t          	                    ||||          \  }}|                    dd	          dddddddf         }||fS )z8
    Computes maximum values for softmax stability.
       z.sparse_qk_prod must be a 4-dimensional tensor.   'indices must be a 2-dimensional tensor.    z>The size of the second dimension of sparse_qk_prod must be 32.r   z=The size of the third dimension of sparse_qk_prod must be 32.dimN)
lensize
ValueErrormaxvalues	transpose
contiguousintr2   	index_max)sparse_qk_prodindicesquery_num_blockkey_num_block
index_valsmax_valsmax_vals_scatters          r(   
sparse_maxrN   ;   s`    >  !!Q&&IJJJ
7<<>>aBCCC1##YZZZ1##XYYY###++2<<RDDJ&&((JkkmmG  ""G!0!:!::wP_an!o!oH'11"b99!!!QQQaaa-H%%%r*   r9   c                 B   t          |                                           dk    rt          d          t          |                                          dk    rt          d          | j        d         |j        d         k    rt          d          | j        \  }}||z  }t	          j        |                    d          t          j        |j                  }|                     |||          } | |dddf         ||z                                  ddf         } | S )zN
    Converts attention mask to a sparse mask for high resolution logits.
    r7   z$mask must be a 2-dimensional tensor.r8   r   zBmask and indices must have the same size in the zero-th dimension.dtypedeviceN)	r>   r?   r@   shapetorcharangelongrR   reshape)maskrH   
block_size
batch_sizeseq_len	num_block	batch_idxs          r(   sparse_maskr^   W   s    499;;1?@@@
7<<>>aBCCCz!}a(((]^^^*J:%IW\\!__EJw~VVVI<<
Iz::D	!!!T'"Wy%8$>$>$@$@!!!CDDKr*   c                 R   |                                  \  }}}|                                 \  }}}||z  dk    rt          d          ||z  dk    rt          d          |                     |||z  ||                              dd          } |                    |||z  ||                              dd          }t	          |                                            dk    rt          d          t	          |                                           dk    rt          d          t	          |                                           d	k    rt          d
          |                      d          dk    rt          d          |                     d          dk    rt          d          |                                 } |                                }|                                }|                                }t                              | ||                                          S )z7
    Performs Sampled Dense Matrix Multiplication.
    r   zTquery_size (size of first dimension of dense_query) must be divisible by block_size.Pkey_size (size of first dimension of dense_key) must be divisible by block_size.r=   r:   r6   z+dense_query must be a 4-dimensional tensor.)dense_key must be a 4-dimensional tensor.r7   r8   r   r9   z.The third dimension of dense_query must be 32.z,The third dimension of dense_key must be 32.)	r?   r@   rW   rC   r>   rD   rE   r2   mm_to_sparse)	dense_query	dense_keyrH   rY   rZ   
query_sizer<   _key_sizes	            r(   rb   rb   n   s    #."2"2"4"4J
C ~~''AxJ!##oppp*!!klll%%j*
2JJX[\\ffgikmnnK!!*h*.DjRUVV``aceghhI
;!##FGGG
9>>!!DEEE
7<<>>aBCCCb  IJJJ~~aBGHHH((**K$$&&IkkmmG  ""G''YNNNr*   c                 "   |                                 \  }}}||z  dk    rt          d          |                      d          |k    rt          d          |                      d          |k    rt          d          |                    |||z  ||                              dd          }t	          |                                            d	k    rt          d
          t	          |                                           d	k    rt          d          t	          |                                           dk    rt          d          |                     d          dk    rt          d          |                                 } |                                }|                                }|                                }t                              | |||          }|                    dd                              |||z  |          }|S )zP
    Performs matrix multiplication of a sparse matrix with a dense matrix.
    r   r`   r7   zQThe size of the second dimension of sparse_query must be equal to the block_size.r   zPThe size of the third dimension of sparse_query must be equal to the block_size.r=   r:   r6   ,sparse_query must be a 4-dimensional tensor.ra   r8   r9   z8The size of the third dimension of dense_key must be 32.)	r?   r@   rW   rC   r>   rD   rE   r2   sparse_dense_mm)	sparse_queryrH   rd   rI   rY   rZ   rg   r<   dense_qk_prods	            r(   rj   rj      s    !* 0 0J#*!!klllz))lmmmz))klll!!*h*.DjRUVV``aceghhI
<1$$GHHH
9>>!!DEEE
7<<>>aBCCC~~aBSTTT**,,LkkmmG  ""G$$&&I#33L'9VeffM!++B33;;JZdHdfijjMr*   c                 f    | |z  |z  t          j        | |d          z                                   S )Nfloorrounding_mode)rT   divrV   )rH   dim_1_blockdim_2_blocks      r(   transpose_indicesrt      s5    {"k1EIg{bi4j4j4jjpprrrr*   c                   R    e Zd Zed             Zed             Zedd            ZdS )MraSampledDenseMatMulc                 f    t          ||||          }|                     |||           || _        |S N)rb   save_for_backwardrY   )ctxrc   rd   rH   rY   rG   s         r(   forwardzMraSampledDenseMatMul.forward   s:    %k9gzRRk9g>>>#r*   c                 $   | j         \  }}}| j        }|                    d          |z  }|                    d          |z  }t          |||          }t	          |                    dd          |||          }	t	          ||||          }
|
|	d d fS Nr   r=   r:   )saved_tensorsrY   r?   rt   rj   rC   )rz   gradrc   rd   rH   rY   rI   rJ   	indices_Tgrad_key
grad_querys              r(   backwardzMraSampledDenseMatMul.backward   s    *-*;'Y^
%**1--;!q))Z7%gNN	"4>>"b#9#99kS`aa$T7IOO
8T4//r*   r9   c                 <    t                               | |||          S rx   )rv   apply)rc   rd   rH   rY   s       r(   operator_callz#MraSampledDenseMatMul.operator_call   s    $**;	7JWWWr*   Nr9   __name__
__module____qualname__staticmethodr{   r   r   r$   r*   r(   rv   rv      sn          \ 0 0 \0 X X X \X X Xr*   rv   c                   P    e Zd Zed             Zed             Zed             ZdS )MraSparseDenseMatMulc                 f    t          ||||          }|                     |||           || _        |S rx   )rj   ry   rI   )rz   rk   rH   rd   rI   rG   s         r(   r{   zMraSparseDenseMatMul.forward   s;    (w	?[[lGY???-r*   c                    | j         \  }}}| j        }|                    d          |                    d          z  }t          |||          }t	          |                    dd          |||          }t          |||          }	|	d |d fS r}   )r~   rI   r?   rt   rj   rC   rb   )
rz   r   rk   rH   rd   rI   rJ   r   r   r   s
             r(   r   zMraSparseDenseMatMul.backward   s    +.+<(gy-!q))\->->r-B-BB%gNN	"<#9#9"b#A#A9dTabb!$	7;;
44//r*   c                 <    t                               | |||          S rx   )r   r   )rk   rH   rd   rI   s       r(   r   z"MraSparseDenseMatMul.operator_call   s    #)),O\\\r*   Nr   r$   r*   r(   r   r      sh          \ 0 0 \0 ] ] \] ] ]r*   r   c                   $    e Zd Zed             ZdS )MraReduceSumc                    |                                  \  }}}}t          |                                            dk    rt          d          t          |                                           dk    rt          d          |                                  \  }}}}|                                 \  }}|                     d                              ||z  |          } t          j        |                     d          t
          j        |j                  }t          j	        ||d	                                          |d d d f         |z  z                       ||z            }	t          j
        ||z  |f| j        | j                  }
|
                    d|	|                               |||          }|                    |||z            }|S )
Nr6   ri   r7   r8   r;   r   rP   rn   ro   )r?   r>   r@   sumrW   rT   rU   rV   rR   rq   zerosrQ   	index_add)rk   rH   rI   rJ   rZ   r\   rY   rf   r]   global_idxestempoutputs               r(   r   zMraReduceSum.operator_call   s   /;/@/@/B/B,
Iz1|  ""##q((KLLLw||~~!##FGGG*//111j! '
I#''A'..66zI7MzZZLa
7>ZZZ	Ig}GDDDIIKKiXYXYXY[_X_N`crNrr
'*y(
)
) 	 {/):6l>PYeYl
 
 
 <>>FFzSbdnoo
Oj,HIIr*   N)r   r   r   r   r   r$   r*   r(   r   r      s-          \  r*   r   c                    |                                  \  }}}||z  }d}	||                    |||                              d          }
|                     ||||                              d          |
dddddf         dz   z  }|                    ||||                              d          |
dddddf         dz   z  }|?|                    ||||                              d          |
dddddf         dz   z  }	n|t          j        ||t          j        | j                  z  }
|                     ||||                              d          }|                    ||||                              d          }|,|                    ||||                              d          }	t          j        ||	                    dd                    t          j        |          z  }|                    dd          j        }|;|d	|
dddddf         |
dddddf         z  d
k                                     z  z
  }||
||	fS )z/
    Compute low resolution approximation.
    Nr=   r;   r:   ư>rP   T)r<   keepdims     @g      ?)r?   rW   r   rT   onesfloatrR   meanmatmulrC   mathsqrtrA   rB   )querykeyrY   rX   valuerZ   r[   head_dimnum_block_per_row	value_hattoken_count	query_hatkey_hatlow_resolution_logitlow_resolution_logit_row_maxs                  r(   get_low_resolution_logitr     s    %*JJLL!J:-Ill:/@*MMQQVXQYYMM*.?XVVZZ_aZbb111d
#d*
	 ++j*;ZRRVV[]V^^111d
#d*
 j2CZQYZZ^^ce^ffAAAqqq$J'$.I !5:j:KSXS^glgs#t#t#ttMM*.?XVV[[`b[cc	++j*;ZRRWW\^W__j2CZQYZZ__df_ggI <	73D3DR3L3LMMPTPYZbPcPcc#7#;#;T#;#R#R#Y  3;qqq$z+B[QRQRQRTUTUTUW[Q[E\+\`c*c)j)j)l)l#ll 	  .JIUUr*   c                    | j         \  }}}|dk    ra|dz  }t          j        ||| j                  }	t          j        t          j        |	|           |          }
| |
dddddf         dz  z   } |dk    r@| ddd|ddf         dz   | ddd|ddf<   | ddddd|f         dz   | ddddd|f<   t          j        |                     |d          |ddd	
          }|j        }|dk    rD|j	        
                    d          j	        }| |ddddf         k                                    }n|dk    rd}nt          | d          ||fS )zZ
    Compute the indices of the subset of components to be used in the approximation.
    r   r7   rR   )diagonalNg     @r=   TF)r<   largestsortedfullr;   sparsez# is not a valid approx_model value.)rS   rT   r   rR   triltriutopkrW   rH   rB   minr   r@   )r   
num_blocksapprox_modeinitial_prior_first_n_blocksinitial_prior_diagonal_n_blocksrZ   total_blocks_per_rowrf   offset	temp_maskdiagonal_mask
top_k_valsrH   	thresholdhigh_resolution_masks                  r(   get_block_idxesr   6  s    +?*D'J$a&**0A5J35IRfRmnnn	
5:i6'#J#J#JU[\\\3mD!!!QQQJ6ORU6UU#a'' $A%A$A111!DEK 	QQQ =!= =qqq@A !AAA'D(D'D!DEK 	QQQ#@$@#@@A $$Z44jbRV_d  J  Gf%))b)118	 4	!!!T4-8P PWWYY		 	 #KLLLMMM(((r*   c	                    t           &t          j        |                                           S |                                 \  }	}
}}|	|
z  }||z  dk    rt          d          ||z  }|                     |||          } |                    |||          }|                    |||          }|6| |dddddf         z  } ||dddddf         z  }||dddddf         z  }|dk    rt          | ||||          \  }}}}nX|dk    rCt          j                    5  t          | |||          \  }}}}ddd           n# 1 swxY w Y   nt          d          t          j                    5  ||z
  }t          |||||          \  }}ddd           n# 1 swxY w Y   t                              | |||          t          j        |          z  }t          ||||          \  }}||z
  }|)|dd	t!          ||          dddddddf         z
  z  z
  }t          j        |          }t$                              ||||          }t&                              ||||          }|dk    rt          j        ||z
  d|z  z
            |dddddf         z  }t          j        ||          dddddddf                             d	d	|d	                              |||          }|                    d
          dddddf                             d	d	|                              ||          }|                    d	d	|                              ||          |z
  } || |z  } t          j        | | dk                                    z            }!||!dddddf         z  }||!z  }t          j        |  | dk                                    z            }"||"dddddf         z  }||"z  }||z   |dddddf         |dddddf         z   dz   z  }#n+|dk    r||dddddf         dz   z  }#nt          d          ||#|dddddf         z  }#|#                    |	|
||          }#|#S )z0
    Use Mra to approximate self-attention.
    Nr   z4sequence length must be divisible by the block_size.r   r   z&approx_mode must be "full" or "sparse")rY   r   r   r=   r;   r   z-config.approx_mode must be "full" or "sparse")r2   rT   
zeros_likerequires_grad_r?   r@   rW   r   no_grad	Exceptionr   rv   r   r   r   rN   r^   expr   r   r   repeatr   r   )$r   r   r   rX   r   r   rY   r   r   rZ   num_headr[   r   
meta_batchr   r   r   r   r   rf   low_resolution_logit_normalizedrH   r   high_resolution_logitrL   rM   high_resolution_attnhigh_resolution_attn_outhigh_resolution_normalizerlow_resolution_attnlow_resolution_attn_outlow_resolution_normalizerlog_correctionlow_resolution_corrhigh_resolution_corrcontext_layers$                                       r(   mra2_attentionr   \  s2    &&55777.3jjll+J'8h&Jq  OPPP:-MM*gx88E
++j'8
4
4CMM*gx88EQQQ4Z((DAAAt$$QQQ4Z((fUm3
D%V
 V
Rk+G 
	 	 ]__ 	 	QisJR RN +/KQ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	
 @AAA	 
 
*>A]*]'(7+(+)
 )
%%
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 2??sG
 @  	( ",,A7L]_p!q!qH14DD 5q;tU\C]C]^_^_^_abababdededegk^kCl?l8m m 9%:;;3AAgu.?    ".!;!;g'8:K" " fI*-IICRfLffgg!!!T111*%& 	 L,i88AAAtQQQGVAq*a((WZ(33 	   ###++AAAqqq$J7>>q!ZPPXXYcelmm 	" 6<<Q:NNVVWacjkknvv+d2N#i.A:M9T9T9V9V(VWW"9<OPQPQPQSTSTSTVZPZ<["[$=@S$S!$y.NQ<N;U;U;W;W)WXX#;>RSTSTSTVWVWVWY]S]>^#^ %?BV%V"14KK&qqq!!!Tz25NqqqRSRSRSUYz5ZZ]aa
 
	 	 04NqqqRSRSRSUYz4Z]a4abGHHH%QQQ4Z(88!))*hRRMs$   EE	E0FFFc                   *     e Zd ZdZ fdZddZ xZS )MraEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                    t                                                       t          j        |j        |j        |j                  | _        t          j        |j        dz   |j                  | _	        t          j        |j
        |j                  | _        t          j        |j        |j                  | _        t          j        |j                  | _        |                     dt%          j        |j                                      d          dz              t+          |dd          | _        |                     dt%          j        | j                                        t$          j        | j        j        	          d
           d S )N)padding_idxr7   epsposition_ids)r   r=   position_embedding_typeabsolutetoken_type_idsrP   F)
persistent)super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_bufferrT   rU   expandgetattrr   r   r   r?   rV   rR   selfconfig	__class__s     r(   r   zMraEmbeddings.__init__  sQ   !|F,=v?Q_e_rsss#%<0NQR0RTZTf#g#g %'\&2H&J\%]%]" f&8f>STTTz&"<== 	^U\&:X-Y-Y-`-`ah-i-ilm-mnnn'.v7PR\']']$K)..00
4K\Kcddd 	 	
 	
 	
 	
 	
r*   Nc                    ||                                 }n|                                 d d         }|d         }|| j        d d d |f         }|mt          | d          r2| j        d d d |f         }|                    |d         |          }|}n+t          j        |t
          j        | j        j                  }|| 	                    |          }| 
                    |          }	||	z   }
| j        dk    r|                     |          }|
|z  }
|                     |
          }
|                     |
          }
|
S )Nr=   r   r   r   rP   r   )r?   r   hasattrr   r   rT   r   rV   rR   r   r   r   r   r   r   )r   	input_idsr   r   inputs_embedsinput_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedr   
embeddingsr   s               r(   r{   zMraEmbeddings.forward  sb    #..**KK',,..ss3K ^
,QQQ^<L
 !t-.. m*.*=aaa*n*M'3J3Q3QR]^_R`bl3m3m0!A!&[
SWSdSk!l!l!l  00;;M $ : :> J J"%::
':55"&":":<"H"H--J^^J//
\\*--
r*   )NNNNr   r   r   __doc__r   r{   __classcell__r   s   @r(   r   r     sR        QQ
 
 
 
 
(               r*   r   c                   (     e Zd Zd fd	ZddZ xZS )MraSelfAttentionNc                 P   t                                                       |j        |j        z  dk    r0t	          |d          s t          d|j         d|j         d          t          d u}t                      rbt                      rTt                      rF|sD	 t                       n4# t          $ r'}t                              d|            Y d }~nd }~ww xY w|j        | _        t          |j        |j        z            | _        | j        | j        z  | _        t#          j        |j        | j                  | _        t#          j        |j        | j                  | _        t#          j        |j        | j                  | _        t#          j        |j                  | _        ||n|j        | _        |j        dz  |j        z  | _        t;          | j        t          |j        dz  dz                      | _        |j        | _        |j        | _        |j         | _         d S )	Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()zGCould not load the custom kernel for multi-scale deformable attention: r9   r7   )!r   r   r   num_attention_headsr  r@   r2   r   r   r   r4   r   loggerwarningrE   attention_head_sizeall_head_sizer   Linearr   r   r   r   attention_probs_dropout_probr   r   r   block_per_rowr\   r   r   r   r   )r   r   r   kernel_loadeder   s        r(   r   zMraSelfAttention.__init__
  s>    ::a??PVXhHiHi?8F$6 8 8 48 8 8  
 (t3"$$ 	n)9);); 	n@R@T@T 	n]j 	nn!#### n n nlijllmmmmmmmmn $*#= #&v'9F<V'V#W#W !58PPYv143EFF
9V/1CDDYv143EFF
z&"EFF'>'J##PVPn 	$ !8B>&BVVT^S&2PTV2V[\1\-]-]^^!-,2,O)/5/U,,,s   B* *
C4CCc           
      *   |j         \  }}}|                     |                              |d| j        | j                                      dd          }|                     |                              |d| j        | j                                      dd          }|                     |                              |d| j        | j                                      dd          }d|dz  z   }|                                	                    d| j        d          
                    || j        z  |                                          }d}	| j        |	k     r|| j        ||	| j        z
  f}
t          j        |t          j        |
|j                  gd          }t          j        |t          j        |
|j                  gd          }t          j        |t          j        |
|j                  gd          }t!          |                                |                                |                                |                                | j        | j        | j        | j        	          }| j        |	k     r|d d d d d d d | j        f         }|
                    || j        || j                  }|                    d
ddd                                          }|                                d d         | j        fz   } |j        | }|f}|S )Nr=   r   r7         ?r   r9   r   r;   )r   r   r   r   r   r:   )rS   r   viewr  r  rC   r   r   squeezer   rW   rE   rT   catr   rR   r   r   r\   r   r   r   permuterD   r?   r  )r   hidden_statesattention_maskrZ   r[   rf   query_layer	key_layervalue_layergpu_warp_sizepad_sizer   new_context_layer_shapeoutputss                 r(   r{   zMraSelfAttention.forward-  s   !.!4
GQJJ}%%T*b$":D<TUUYq!__ 	 HH]##T*b$":D<TUUYq!__ 	 JJ}%%T*b$":D<TUUYq!__ 	 ~77""$$VAt/33WZ$"::GDDSUU	 	 #m33!4#;WmVZVnFnnH)[%+h{Oa2b2b2b$ciklllK	9ek(9K[.\.\.\"]cefffI)[%+h{Oa2b2b2b$ciklllK&OO  ""N()-)J,0,P	
 	
 	
 #m33)!!!QQQ3MT5M3M*MNM%--j$:RT[]a]uvv%--aAq99DDFF"/"4"4"6"6ss";t?Q>S"S**,CD "r*   rx   r   r   r   r   r{   r  r  s   @r(   r  r  	  sX        !V !V !V !V !V !VF< < < < < < < <r*   r  c                   P     e Zd Z fdZdej        dej        dej        fdZ xZS )MraSelfOutputc                    t                                                       t          j        |j        |j                  | _        t          j        |j        |j                  | _        t          j        |j	                  | _
        d S Nr   )r   r   r   r  r   denser   r   r   r   r   r   s     r(   r   zMraSelfOutput.__init__n  sf    Yv163EFF
f&8f>STTTz&"<==r*   r"  input_tensorreturnc                     |                      |          }|                     |          }|                     ||z             }|S rx   r0  r   r   r   r"  r1  s      r(   r{   zMraSelfOutput.forwardt  @    

=11]33}|'CDDr*   r   r   r   r   rT   Tensorr{   r  r  s   @r(   r-  r-  m  i        > > > > >U\  RWR^        r*   r-  c                   .     e Zd Zd fd	Zd ZddZ xZS )MraAttentionNc                     t                                                       t          ||          | _        t	          |          | _        t                      | _        d S )N)r   )r   r   r  r   r-  r   setpruned_heads)r   r   r   r   s      r(   r   zMraAttention.__init__|  sO    $VE\]]]	#F++EEr*   c                    t          |          dk    rd S t          || j        j        | j        j        | j                  \  }}t          | j        j        |          | j        _        t          | j        j        |          | j        _        t          | j        j	        |          | j        _	        t          | j
        j        |d          | j
        _        | j        j        t          |          z
  | j        _        | j        j        | j        j        z  | j        _        | j                            |          | _        d S )Nr   r   r;   )r>   r   r   r  r  r>  r   r   r   r   r   r0  r  union)r   headsindexs      r(   prune_headszMraAttention.prune_heads  s    u::??F7490$)2OQUQb
 
u
 -TY_eDD	*49=%@@	,TY_eDD	.t{/@%QOOO )-	(EE

(R	%"&)"?$)B_"_	 -33E::r*   c                     |                      ||          }|                     |d         |          }|f|dd          z   }|S Nr   r   )r   r   )r   r"  r#  self_outputsattention_outputr*  s         r(   r{   zMraAttention.forward  sH    yy??;;|AFF#%QRR(88r*   rx   )r   r   r   r   rC  r{   r  r  s   @r(   r;  r;  {  s`        " " " " " "; ; ;$       r*   r;  c                   B     e Zd Z fdZdej        dej        fdZ xZS )MraIntermediatec                    t                                                       t          j        |j        |j                  | _        t          |j        t                    rt          |j                 | _        d S |j        | _        d S rx   )r   r   r   r  r   intermediate_sizer0  
isinstance
hidden_actstrr   intermediate_act_fnr   s     r(   r   zMraIntermediate.__init__  sn    Yv163KLL
f'-- 	9'-f.?'@D$$$'-'8D$$$r*   r"  r2  c                 Z    |                      |          }|                     |          }|S rx   )r0  rO  r   r"  s     r(   r{   zMraIntermediate.forward  s,    

=1100??r*   r7  r  s   @r(   rI  rI    s^        9 9 9 9 9U\ el        r*   rI  c                   P     e Zd Z fdZdej        dej        dej        fdZ xZS )	MraOutputc                    t                                                       t          j        |j        |j                  | _        t          j        |j        |j                  | _        t          j	        |j
                  | _        d S r/  )r   r   r   r  rK  r   r0  r   r   r   r   r   r   s     r(   r   zMraOutput.__init__  sf    Yv79KLL
f&8f>STTTz&"<==r*   r"  r1  r2  c                     |                      |          }|                     |          }|                     ||z             }|S rx   r4  r5  s      r(   r{   zMraOutput.forward  r6  r*   r7  r  s   @r(   rS  rS    r9  r*   rS  c                   ,     e Zd Z fdZddZd Z xZS )MraLayerc                     t                                                       |j        | _        d| _        t	          |          | _        |j        | _        t          |          | _        t          |          | _
        d S Nr   )r   r   chunk_size_feed_forwardseq_len_dimr;  	attentionadd_cross_attentionrI  intermediaterS  r   r   s     r(   r   zMraLayer.__init__  si    '-'E$%f--#)#= +F33''r*   Nc                     |                      ||          }|d         }|dd          }t          | j        | j        | j        |          }|f|z   }|S rE  )r\  r   feed_forward_chunkrZ  r[  )r   r"  r#  self_attention_outputsrG  r*  layer_outputs          r(   r{   zMraLayer.forward  sd    !%~!N!N1!4(,0#T%A4CSUe
 
  /G+r*   c                 \    |                      |          }|                     ||          }|S rx   )r^  r   )r   rG  intermediate_outputrb  s       r(   r`  zMraLayer.feed_forward_chunk  s2    "//0@AA{{#68HIIr*   rx   )r   r   r   r   r{   r`  r  r  s   @r(   rW  rW    s[        ( ( ( ( (         r*   rW  c                   .     e Zd Z fdZ	 	 	 	 ddZ xZS )
MraEncoderc                     t                                                       | _        t          j        fdt          j                  D                       | _        d| _        d S )Nc                 .    g | ]}t                    S r$   )rW  )r%   rf   r   s     r(   r)   z'MraEncoder.__init__.<locals>.<listcomp>  s!    #^#^#^HV$4$4#^#^#^r*   F)	r   r   r   r   
ModuleListrangenum_hidden_layerslayergradient_checkpointingr   s    `r(   r   zMraEncoder.__init__  s`    ]#^#^#^#^eFD\>]>]#^#^#^__
&+###r*   NFTc                     |rdnd }t          | j                  D ]!\  }}|r||fz   } |||          }	|	d         }"|r||fz   }|st          d ||fD                       S t          ||          S )Nr$   r   c              3      K   | ]}||V  	d S rx   r$   )r%   vs     r(   	<genexpr>z%MraEncoder.forward.<locals>.<genexpr>  s"      XXq!-----XXr*   )last_hidden_stater"  )	enumeraterl  tupler   )
r   r"  r#  	head_maskoutput_hidden_statesreturn_dictall_hidden_statesilayer_modulelayer_outputss
             r(   r{   zMraEncoder.forward  s     #7@BBD(44 	- 	-OA|# I$58H$H!(LGGM)!,MM 	E 1]4D D 	YXX]4E$FXXXXXX1++
 
 
 	
r*   )NNFTr+  r  s   @r(   rf  rf    sZ        , , , , , "
 
 
 
 
 
 
 
r*   rf  c                   B     e Zd Z fdZdej        dej        fdZ xZS )MraPredictionHeadTransformc                 V   t                                                       t          j        |j        |j                  | _        t          |j        t                    rt          |j                 | _
        n|j        | _
        t          j        |j        |j                  | _        d S r/  )r   r   r   r  r   r0  rL  rM  rN  r   transform_act_fnr   r   r   s     r(   r   z#MraPredictionHeadTransform.__init__  s    Yv163EFF
f'-- 	6$*6+<$=D!!$*$5D!f&8f>STTTr*   r"  r2  c                     |                      |          }|                     |          }|                     |          }|S rx   )r0  r  r   rQ  s     r(   r{   z"MraPredictionHeadTransform.forward  s=    

=11--m<<}55r*   r7  r  s   @r(   r}  r}    sc        U U U U UU\ el        r*   r}  c                   *     e Zd Z fdZd Zd Z xZS )MraLMPredictionHeadc                 >   t                                                       t          |          | _        t	          j        |j        |j        d          | _        t	          j	        t          j        |j                            | _        | j        | j        _        d S )NF)bias)r   r   r}  	transformr   r  r   r   decoder	ParameterrT   r   r  r   s     r(   r   zMraLMPredictionHead.__init__  sz    3F;; y!3V5FUSSSLV->!?!?@@	 !Ir*   c                 (    | j         | j        _         d S rx   )r  r  r   s    r(   _tie_weightsz MraLMPredictionHead._tie_weights  s     Ir*   c                 Z    |                      |          }|                     |          }|S rx   )r  r  rQ  s     r(   r{   zMraLMPredictionHead.forward  s*    }55]33r*   )r   r   r   r   r  r{   r  r  s   @r(   r  r    sV        & & & & && & &      r*   r  c                   B     e Zd Z fdZdej        dej        fdZ xZS )MraOnlyMLMHeadc                 p    t                                                       t          |          | _        d S rx   )r   r   r  predictionsr   s     r(   r   zMraOnlyMLMHead.__init__'  s/    .v66r*   sequence_outputr2  c                 0    |                      |          }|S rx   )r  )r   r  prediction_scoress      r(   r{   zMraOnlyMLMHead.forward+  s     ,,_==  r*   r7  r  s   @r(   r  r  &  s^        7 7 7 7 7!u| ! ! ! ! ! ! ! ! !r*   r  c                   8    e Zd ZU eed<   dZdZdej        fdZ	dS )MraPreTrainedModelr   r!   Tmodulec                    | j         j        }t          |t          j                  rJ|j        j                            d|           |j         |j        j        	                                 dS dS t          |t          j
                  rU|j        j                            d|           |j        +|j        j        |j                 	                                 dS dS t          |t          j                  r?|j        j        	                                 |j        j                            d           dS t          |t                    r |j        j        	                                 dS dS )zInitialize the weightsg        )r   stdNr  )r   initializer_rangerL  r   r  weightdatanormal_r  zero_r   r   r   fill_r  )r   r  r  s      r(   _init_weightsz MraPreTrainedModel._init_weights7  sY   k+fbi(( 	% M&&CS&999{& &&((((( '&-- 	%M&&CS&999!-"6#56<<>>>>> .--- 	%K""$$$M$$S))))) 344 	%K""$$$$$	% 	%r*   N)
r   r   r   r   __annotations__base_model_prefixsupports_gradient_checkpointingr   Moduler  r$   r*   r(   r  r  0  sM          &*#%BI % % % % % %r*   r  c                   *    e Zd Z fdZd Zd Zd Ze	 	 	 	 	 	 	 	 ddee	j
                 dee	j
                 dee	j
                 d	ee	j
                 d
ee	j
                 dee	j
                 dee         dee         deeef         fd            Z xZS )MraModelc                     t                                          |           || _        t          |          | _        t          |          | _        |                                  d S rx   )r   r   r   r   r  rf  encoder	post_initr   s     r(   r   zMraModel.__init__M  sX       '//!&)) 	r*   c                     | j         j        S rx   r  r   r  s    r(   get_input_embeddingszMraModel.get_input_embeddingsW  s    ..r*   c                     || j         _        d S rx   r  )r   r   s     r(   set_input_embeddingszMraModel.set_input_embeddingsZ  s    */'''r*   c                     |                                 D ]/\  }}| j        j        |         j                            |           0dS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr  rl  r\  rC  )r   heads_to_prunerl  rA  s       r(   _prune_headszMraModel._prune_heads]  sU    
 +0022 	C 	CLE5Lu%/;;EBBBB	C 	Cr*   Nr  r#  r   r   ru  r  rv  rw  r2  c	                    ||n| j         j        }||n| j         j        }||t          d          |+|                     ||           |                                }	n.||                                d d         }	nt          d          |	\  }
}||j        n|j        }|t          j        |
|f|          }|gt          | j
        d          r1| j
        j        d d d |f         }|                    |
|          }|}n!t          j        |	t          j        |          }|                     ||	          }|                     || j         j                  }| 
                    ||||          }|                     |||||          }|d	         }|s|f|d
d          z   S t'          ||j        |j        |j                  S )NzDYou cannot specify both input_ids and inputs_embeds at the same timer=   z5You have to specify either input_ids or inputs_embedsr   r   rP   )r  r   r   r  )r#  ru  rv  rw  r   r   )rr  r"  
attentionscross_attentions)r   rv  use_return_dictr@   %warn_if_padding_and_no_attention_maskr?   rR   rT   r   r  r  r   r   r   rV   get_extended_attention_maskget_head_maskrk  r  r   r"  r  r  )r   r  r#  r   r   ru  r  rv  rw  r  rZ   r  rR   r  r  extended_attention_maskembedding_outputencoder_outputsr  s                      r(   r{   zMraModel.forwarde  s2    %9$D  $+Jj 	 &1%<kk$+B] ]%>cddd"66y.QQQ#..**KK&',,..ss3KKTUUU!,
J%.%:!!@T!"Z*j)A6RRRN!t(899 [*./*HKZK*X'3J3Q3QR\^h3i3i0!A!&[
SY!Z!Z!Z 150P0PQ_al0m0m &&y$+2OPP	??%)'	 + 
 
 ,,2!5# ' 
 
 *!, 	<#%(;;;1-)7&1,=	
 
 
 	
r*   )NNNNNNNN)r   r   r   r   r  r  r  r   r   rT   r8  boolr   rt  r   r{   r  r  s   @r(   r  r  K  sN           / / /0 0 0C C C  -11515/3,004/3&*J
 J
EL)J
 !.J
 !.	J

 u|,J
 EL)J
  -J
 'tnJ
 d^J
 
u88	9J
 J
 J
 ^J
 J
 J
 J
 J
r*   r  c                   H    e Zd ZddgZ fdZd Zd Ze	 	 	 	 	 	 	 	 	 ddee	j
                 dee	j
                 d	ee	j
                 d
ee	j
                 dee	j
                 dee	j
                 dee	j
                 dee         dee         deeef         fd            Z xZS )MraForMaskedLMzcls.predictions.decoder.weightzcls.predictions.decoder.biasc                     t                                          |           t          |          | _        t	          |          | _        |                                  d S rx   )r   r   r  r!   r  clsr  r   s     r(   r   zMraForMaskedLM.__init__  sQ       F##!&)) 	r*   c                 $    | j         j        j        S rx   )r  r  r  r  s    r(   get_output_embeddingsz$MraForMaskedLM.get_output_embeddings  s    x#++r*   c                 T    || j         j        _        |j        | j         j        _        d S rx   )r  r  r  r  )r   new_embeddingss     r(   set_output_embeddingsz$MraForMaskedLM.set_output_embeddings  s%    '5$$2$7!!!r*   Nr  r#  r   r   ru  r  labelsrv  rw  r2  c
           
         |	|	n| j         j        }	|                     ||||||||	          }
|
d         }|                     |          }d}|Kt	                      } ||                    d| j         j                  |                    d                    }|	s|f|
dd         z   }||f|z   n|S t          |||
j        |
j	                  S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        Nr#  r   r   ru  r  rv  rw  r   r=   r   losslogitsr"  r  )
r   r  r!   r  r   r  r   r   r"  r  )r   r  r#  r   r   ru  r  r  rv  rw  r*  r  r  masked_lm_lossloss_fctr   s                   r(   r{   zMraForMaskedLM.forward  s   & &1%<kk$+B](())%'!5#  	
 	
 "!* HH_55'))H%X&7&<&<RAW&X&XZ`ZeZefhZiZijjN 	Z')GABBK7F3A3M^%..SYY$!/)	
 
 
 	
r*   	NNNNNNNNN)r   r   r   _tied_weights_keysr   r  r  r   r   rT   r8  r  r   rt  r   r{   r  r  s   @r(   r  r    sI       :<Z[    , , ,8 8 8  -11515/3,004)-/3&*0
 0
EL)0
 !.0
 !.	0

 u|,0
 EL)0
  -0
 &0
 'tn0
 d^0
 
un$	%0
 0
 0
 ^0
 0
 0
 0
 0
r*   r  c                   (     e Zd ZdZ fdZd Z xZS )MraClassificationHeadz-Head for sentence-level classification tasks.c                 "   t                                                       t          j        |j        |j                  | _        t          j        |j                  | _        t          j        |j        |j	                  | _
        || _        d S rx   )r   r   r   r  r   r0  r   r   r   
num_labelsout_projr   r   s     r(   r   zMraClassificationHead.__init__  sj    Yv163EFF
z&"<==	&"4f6GHHr*   c                 
   |d d dd d f         }|                      |          }|                     |          }t          | j        j                 |          }|                      |          }|                     |          }|S )Nr   )r   r0  r   r   rM  r  )r   featureskwargsxs       r(   r{   zMraClassificationHead.forward  st    QQQ111WLLOOJJqMM4;)*1--LLOOMM!r*   r	  r  s   @r(   r  r    sM        77          r*   r  z
    MRA Model transformer with a sequence classification/regression head on top (a linear layer on top of
    the pooled output) e.g. for GLUE tasks.
    )custom_introc                   4    e Zd Z fdZe	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej                 deej                 d	eej                 d
ee         dee         de	e
ef         fd            Z xZS )MraForSequenceClassificationc                     t                                          |           |j        | _        t          |          | _        t          |          | _        |                                  d S rx   )r   r   r  r  r!   r  
classifierr  r   s     r(   r   z%MraForSequenceClassification.__init__  s[        +F##/77 	r*   Nr  r#  r   r   ru  r  r  rv  rw  r2  c
           
         |	|	n| j         j        }	|                     ||||||||	          }
|
d         }|                     |          }d}|Z| j         j        f| j        dk    rd| j         _        nN| j        dk    r7|j        t          j        k    s|j        t          j	        k    rd| j         _        nd| j         _        | j         j        dk    rWt                      }| j        dk    r1 ||                                |                                          }n |||          }n| j         j        dk    rGt                      } ||                    d| j                  |                    d                    }n*| j         j        dk    rt                      } |||          }|	s|f|
dd         z   }||f|z   n|S t          |||
j        |
j        	          S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr  r   r   
regressionsingle_label_classificationmulti_label_classificationr=   r  )r   r  r!   r  problem_typer  rQ   rT   rV   rE   r	   r  r   r  r   r   r"  r  )r   r  r#  r   r   ru  r  r  rv  rw  r*  r  r  r  r  r   s                   r(   r{   z$MraForSequenceClassification.forward!  s   & &1%<kk$+B](())%'!5#  	
 	
 "!*11{'/?a''/;DK,,_q((flej.H.HFL\a\eLeLe/LDK,,/KDK,{'<77"99?a''#8FNN$4$4fnn6F6FGGDD#8FF33DD)-JJJ+--xB @ @&++b//RR)-III,..x// 	FY,F)-)9TGf$$vE'!/)	
 
 
 	
r*   r  )r   r   r   r   r   r   rT   r8  r  r   rt  r   r{   r  r  s   @r(   r  r    s3             -11515/3,004)-/3&*A
 A
EL)A
 !.A
 !.	A

 u|,A
 EL)A
  -A
 &A
 'tnA
 d^A
 
u..	/A
 A
 A
 ^A
 A
 A
 A
 A
r*   r  c                   4    e Zd Z fdZe	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej                 deej                 d	eej                 d
ee         dee         de	e
ef         fd            Z xZS )MraForMultipleChoicec                     t                                          |           t          |          | _        t	          j        |j        |j                  | _        t	          j        |j        d          | _        | 	                                 d S rY  )
r   r   r  r!   r   r  r   pre_classifierr  r  r   s     r(   r   zMraForMultipleChoice.__init__h  sr       F## i(:F<NOO)F$6:: 	r*   Nr  r#  r   r   ru  r  r  rv  rw  r2  c
           
         |	|	n| j         j        }	||j        d         n|j        d         }
|)|                    d|                    d                    nd}|)|                    d|                    d                    nd}|)|                    d|                    d                    nd}|)|                    d|                    d                    nd}|=|                    d|                    d          |                    d                    nd}|                     ||||||||	          }|d         }|dddf         }|                     |          } t          j                    |          }| 	                    |          }|                    d|
          }d}|t                      } |||          }|	s|f|dd         z   }||f|z   n|S t          |||j        |j                  S )a[  
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        Nr   r=   r:   r  r   r  )r   r  rS   r  r?   r!   r  r   ReLUr  r   r   r"  r  )r   r  r#  r   r   ru  r  r  rv  rw  num_choicesr*  hidden_statepooled_outputr  reshaped_logitsr  r  r   s                      r(   r{   zMraForMultipleChoice.forwardr  sQ   V &1%<kk$+B],5,Aioa((}GZ[\G]>G>SINN2y~~b'9'9:::Y]	M[Mg,,R1D1DR1H1HIIImqM[Mg,,R1D1DR1H1HIIImqGSG_|((\->->r-B-BCCCei ( r=#5#5b#9#9=;M;Mb;Q;QRRR 	 (())%'!5#  	
 	
 qz$QQQT*++M::!		-00// ++b+66'))H8OV44D 	F%''!""+5F)-)9TGf$$vE("!/)	
 
 
 	
r*   r  )r   r   r   r   r   r   rT   r8  r  r   rt  r   r{   r  r  s   @r(   r  r  f  s3             -11515/3,004)-/3&*X
 X
EL)X
 !.X
 !.	X

 u|,X
 EL)X
  -X
 &X
 'tnX
 d^X
 
u//	0X
 X
 X
 ^X
 X
 X
 X
 X
r*   r  c                   4    e Zd Z fdZe	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej                 deej                 d	eej                 d
ee         dee         de	e
ef         fd            Z xZS )MraForTokenClassificationc                 6   t                                          |           |j        | _        t          |          | _        t          j        |j                  | _        t          j	        |j
        |j                  | _        |                                  d S rx   )r   r   r  r  r!   r   r   r   r   r  r   r  r  r   s     r(   r   z"MraForTokenClassification.__init__  sy        +F##z&"<==)F$68IJJ 	r*   Nr  r#  r   r   ru  r  r  rv  rw  r2  c
           
         |	|	n| j         j        }	|                     ||||||||	          }
|
d         }|                     |          }|                     |          }d}|t                      }||                    d          dk    }|                    d| j                  }t          j	        ||                    d          t          j
        |j                                      |                    } |||          }n8 ||                    d| j                  |                    d                    }|	s|f|
dd         z   }||f|z   n|S t          |||
j        |
j                  S )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Nr  r   r=   r   r  )r   r  r!   r   r  r   r  r  rT   wheretensorignore_indextype_asr   r"  r  )r   r  r#  r   r   ru  r  r  rv  rw  r*  r  r  r  r  active_lossactive_logitsactive_labelsr   s                      r(   r{   z!MraForTokenClassification.forward  s   " &1%<kk$+B](())%'!5#  	
 	
 "!*,,7711'))H),11"55: &B @ @ %R%,x?T2U2U2]2]^d2e2e! !  x}==xB @ @&++b//RR 	FY,F)-)9TGf$$vE$!/)	
 
 
 	
r*   r  )r   r   r   r   r   r   rT   r8  r  r   rt  r   r{   r  r  s   @r(   r  r    s        	 	 	 	 	  -11515/3,004)-/3&*9
 9
EL)9
 !.9
 !.	9

 u|,9
 EL)9
  -9
 &9
 'tn9
 d^9
 
u++	,9
 9
 9
 ^9
 9
 9
 9
 9
r*   r  c                   P    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej                 deej                 d	eej                 d
eej                 dee         dee         de	e
ef         fd            Z xZS )MraForQuestionAnsweringc                    t                                          |           d|_        |j        | _        t          |          | _        t          j        |j        |j                  | _        | 	                                 d S )Nr7   )
r   r   r  r  r!   r   r  r   
qa_outputsr  r   s     r(   r   z MraForQuestionAnswering.__init__  sm        +F##)F$68IJJ 	r*   Nr  r#  r   r   ru  r  start_positionsend_positionsrv  rw  r2  c           
      f   |
|
n| j         j        }
|                     |||||||	|
          }|d         }|                     |          }|                    dd          \  }}|                    d          }|                    d          }d }||t          |                                          dk    r|                    d          }t          |                                          dk    r|                    d          }|                    d          }|                    d|          }|                    d|          }t          |          } |||          } |||          }||z   dz  }|
s||f|dd          z   }||f|z   n|S t          ||||j        |j                  S )	Nr  r   r   r=   r;   )r  r7   )r  start_logits
end_logitsr"  r  )r   r  r!   r  splitr  r>   r?   clampr   r   r"  r  )r   r  r#  r   r   ru  r  r  r  rv  rw  r*  r  r  r  r  
total_lossignored_indexr  
start_lossend_lossr   s                         r(   r{   zMraForQuestionAnswering.forward&  s    &1%<kk$+B](())%'!5#  	
 	
 "!*11#)<<r<#:#: j#++B//''++

&=+D?''))**Q.."1"9"9""="==%%''((1,, - 5 5b 9 9(--a00M-33A}EEO)//=AAM']CCCH!,@@Jx
M::H$x/14J 	R"J/'!""+=F/9/EZMF**6Q+%!!/)
 
 
 	
r*   )
NNNNNNNNNN)r   r   r   r   r   r   rT   r8  r  r   rt  r   r{   r  r  s   @r(   r  r    s4       
 
 
 
 
  -11515/3,0042604/3&*<
 <
EL)<
 !.<
 !.	<

 u|,<
 EL)<
  -<
 "%,/<
  -<
 'tn<
 d^<
 
u22	3<
 <
 <
 ^<
 <
 <
 <
 <
r*   r  )r  r  r  r  r  rW  r  r  r   )NN)r9   r   r   )Or
  r   pathlibr   typingr   r   rT   r   torch.nnr   r   r	   torch.utils.cpp_extensionr
   activationsr   modeling_layersr   modeling_outputsr   r   r   r   r   r   modeling_utilsr   pytorch_utilsr   r   r   utilsr   r   r   r   r   configuration_mrar   
get_loggerr   r  r2   r4   rN   r^   rb   rj   rt   autogradFunctionrv   r   r   r   r   r   r  r   r  r-  r;  rI  rS  rW  rf  r}  r  r  r  r  r  r  r  r  r  r  __all__r$   r*   r(   <module>r     s            " " " " " " " "        A A A A A A A A A A * * * * * * ! ! ! ! ! ! 9 9 9 9 9 9                . - - - - - l l l l l l l l l l k k k k k k k k k k k k k k ( ( ( ( ( ( 
	H	%	%	C 	C 	C& & &8   .%O %O %O %OP% % % %Ps s sX X X X XEN3 X X X0] ] ] ] ]5>2 ] ] ].       :%V %V %V %VP#) #) #)Z !"$%p p p pf7 7 7 7 7BI 7 7 7t` ` ` ` `ry ` ` `H    BI       29   B    bi        	       )   :!
 !
 !
 !
 !
 !
 !
 !
J       $    ")   0! ! ! ! !RY ! ! ! % % % % % % % %2 d
 d
 d
 d
 d
! d
 d
 d
N D
 D
 D
 D
 D
' D
 D
 D
P    BI   *   L
 L
 L
 L
 L
#5 L
 L
 L
^ d
 d
 d
 d
 d
- d
 d
 d
N F
 F
 F
 F
 F
 2 F
 F
 F
R J
 J
 J
 J
 J
0 J
 J
 J
Z	 	 	r*   