
     `i                        d Z ddlZddlmZmZ ddlZddlmZ ddlmZ ddl	m
Z
mZmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZ ddlmZ ddlmZmZmZ ddlmZ ddl m!Z!  ej"        e#          Z$dZ%dZ&dZ'g dZ(dZ)dZ* G d dej+                  Z, G d dej+                  Z- G d dej+                  Z. G d dej+                  Z/ G d d ej+                  Z0 G d! d"ej+                  Z1 G d# d$ej+                  Z2 G d% d&ej+                  Z3 G d' d(e          Z4 G d) d*e          Z5d+Z6d,Z7 G d- d.e5          Z8 ed/e6           G d0 d1e5                      Z9 ed2e6           G d3 d4e5                      Z:g d5Z;dS )6zPyTorch M-CTC-T model.    N)OptionalUnion)nn   )ACT2FN)add_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forward)is_deepspeed_zero3_enabled)is_fsdp_managed_module)_prepare_4d_attention_mask)GradientCheckpointingLayer)BaseModelOutputCausalLMOutput)PreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)logging   )MCTCTConfigr   zspeechbrain/m-ctc-t-large)r      i   zY"Mr. Quilter is the apostle of the middle classes, and we're glad to welcome his gospel."gv@c                   (     e Zd ZdZ fdZd Z xZS )MCTCTConv1dSubsamplerz
    Convolutional subsampler: a stack of 1D convolution (along temporal dimension) followed by non-linear activation
    via gated linear units (https://huggingface.co/papers/1911.08460)
    c                     t                                                       | _        |j         _        t          j        |j                   _        |j	         _
        |j        |j        z   _         j
        dk    r#|j        t          d          |j         _        nd  _        |j        dz   _        |j         _        |j         _        t          j         fdt1           j                  D                        _        d S )Nr   zbNeed to specify `conv_channels` configuration in `MCTCTConfig` to use multiple convolution layers.   c              3      K   | ]c\  }}t          j        |d k    rj        nj        |         |j        dz
  k     rj        |         nj        |j        |         d          V  ddS )r   r   valid)kernel_sizestridepaddingN)r   Conv1din_channelsmid_channels
num_layersout_channelsr    ).0ikselfs      /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/deprecated/mctct/modeling_mctct.py	<genexpr>z1MCTCTConv1dSubsampler.__init__.<locals>.<genexpr>U   s       	)
 	)
 1 I$%FF  0A!0D()DOa,?(?(?!!$$TEV{1~  	)
 	)
 	)
 	)
 	)
 	)
    )super__init__configconv_glu_dimglu_dimr   Dropoutconv_dropoutdropoutnum_conv_layersr%   input_feat_per_channelinput_channelsr#   conv_channels
ValueErrorr$   hidden_sizer&   conv_kernelr   conv_strider    
ModuleList	enumerateconv_layersr*   r0   	__class__s   ` r+   r/   zMCTCTConv1dSubsampler.__init__9   s   *z&"566 0!86;PP?Q#+   
 !' 4D $D".2!-(
 = 	)
 	)
 	)
 	)
 "$"233	)
 	)
 	)
 	
 	
r-   c                    t          d | j        D                       }t          j        j                            |dd||fdd          }|                    dd                                          }| j        D ]H} ||          }t          j        	                    || j
                  }|                     |          }I|                    dd                                          }|S )Nc              3       K   | ]	}|d z  V  
dS )r   N )r'   sizes     r+   r,   z0MCTCTConv1dSubsampler.forward.<locals>.<genexpr>c   s&      ==Ddai======r-   r   constantr   r   dim)sumr   torchr   
functionalpad	transpose
contiguousr@   glur2   r5   )r*   input_featuresr!   hidden_statesconvs        r+   forwardzMCTCTConv1dSubsampler.forward`   s     ==D,<=====,00!QQXAY[eghii&00A66AACC$ 	8 	8D D//MM--m-NNM LL77MM%//155@@BBr-   __name__
__module____qualname____doc__r/   rT   __classcell__rB   s   @r+   r   r   3   sR         
%
 %
 %
 %
 %
N      r-   r   c                   ,     e Zd ZdZ fdZ	 ddZ xZS )MCTCTEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                    t                                                       t          j        |j        |j        |j                  | _        t          j        |j        |j                  | _	        t          j        |j
        |j                  | _        t                      | _        t          j        |j                  | _        |                     dt%          j        |j                                      d          d           |                     dt%          j        | j                                        t$          j        | j        j                  d           d S )N)padding_idxposition_ids)r   F)
persistenttoken_type_idsdtypedevice)r.   r/   r   	Embedding
vocab_sizer;   pad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddingsMCTCTLayerNorm	LayerNormr3   hidden_dropout_probr5   register_bufferrK   arangeexpandzerosr`   rF   longrf   rA   s     r+   r/   zMCTCTEmbeddings.__init__s   s1   !|F,=v?Q_e_rsss#%<0NPVPb#c#c %'\&2H&J\%]%]"
 ())z&"<== 	EL)GHHOOPWXXej 	 	
 	
 	
 	K)..00
4K\Kcddd 	 	
 	
 	
 	
 	
r-   Nr   c                 >   ||                                 n|                                 d d         }|d         }|| j        d d |||z   f         }|mt          | d          r2| j        d d d |f         }|                    |d         |          }	|	}n+t          j        |t
          j        | j        j                  }|| 	                    |          }| 
                    |          }
||
z   }|                     |          }|                     |          }|S )Nra   r   rc   r   rd   )rF   r`   hasattrrc   rt   rK   ru   rv   rf   rj   rn   rp   r5   )r*   rQ   rc   r`   inputs_embedspast_key_values_lengthinput_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedrn   
embeddingss               r+   rT   zMCTCTEmbeddings.forward   sB    0>/In))+++}OaOaOcOcdgegdgOh ^
,QQQ0FVlIl0l-lmL
 !t-.. m*.*=aaa*n*M'3J3Q3QR]^_R`bl3m3m0!A!&[
SWSdSk!l!l!l  00@@M $ : :> J J"%::
^^J//
\\*--
r-   )NNNNr   rU   r[   s   @r+   r]   r]   p   sY        QQ
 
 
 
 
. wx       r-   r]   c                   >     e Zd Z fdZd Zd Zd Z	 	 	 ddZ xZS )	MCTCTSelfAttentionc                    t                                                       |j        |j        z  dk    r0t	          |d          s t          d|j         d|j         d          |j        | _        |j        | _        | j        | j        z  | _        t          j
        |j        | j        d          | _        t          j
        |j        | j        d          | _        t          j
        |j        | j        d          | _        t          j        |j                  | _        |j        | _        t          j        d|j        z  d	z
  | j                  | _        |j        | _        d S )
Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()Fbiasr   r   )r.   r/   r;   num_attention_headsrx   r:   attention_head_dimattention_head_sizeall_head_sizer   Linearquerykeyvaluer3   attention_probs_dropout_probr5   rk   rg   distance_embedding
is_decoderrA   s     r+   r/   zMCTCTSelfAttention.__init__   sS    ::a??PVXhHiHi?8F$6 8 8 48 8 8  
 $*#= #)#< !58PPYv143EERRR
9V/1C%PPPYv143EERRR
z&"EFF'-'E$"$,q63Q/QTU/UW[Wo"p"p +r-   c                     |                                 d d         | j        | j        fz   } |j        | }|                    dddd          S )Nra   r   r   r      )rF   r   r   viewpermute)r*   xnew_x_shapes      r+   transpose_for_scoresz'MCTCTSelfAttention.transpose_for_scores   sM    ffhhssmt'?AY&ZZAFK yyAq!$$$r-   c           	      *   t          |j                  dk    r6 |j        t          t	          t          |j                                       }  |j        t          |           j        t          t	          t          |                               S )Nr   )lenshaper   reversedrangereshape)r*   r   r   s      r+   reshape_fortranz"MCTCTSelfAttention.reshape_fortran   sp    qw<<!	8E#ag,,$7$7889A2yqy(5//*2HU3u::=N=N4O4OPPr-   c           	         |                     dddd          }|j        \  }}}}t          j        |t          j        ||||f|j                  fd          }|                     ||||z   |z  d|g          }|d d d ||z   dz
  |z  f         }|                     ||||z   dz
  ||g          }|dz  }|d d |||z   f                             dd          }|                     dddd          S )Nr   r   r   r   rf   rH   )r   r   rK   catru   rf   r   rN   )r*   scoresbatchhidden_stateseq_lenheads	halfpoints          r+   "relative_position_embedding_rotatez5MCTCTSelfAttention.relative_position_embedding_rotate   s5    1a++.4l+|We FEK%0PY_Yf$g$g$ghnoppp %%fu|g7MQX6XZ[]b.cdd Cg4q8GCCCD %%fulW6Lq6PRY[`.abb A%	9y7':::;EEaKK~~aAq)))r-   NFc                 @   |                      |          }|t          j        | j                  z  }|                     |                     |                    }|                     |                     |                    }|                     |          }t          j        ||	                    dd                    }	| j
        j        }
t          j        d|
|	                    dd                    }|                     |          }|	|z   }	||	|z   }	t          j                            |	d          }|                     |          }|||z  }t          j        ||          }|                    dddd                              d	          }|r||fn|f}|S )
Nra   zlh, bche -> bcler   r   rH   r   r   )	start_dim)r   mathsqrtr   r   r   r   rK   matmulrN   r   weighteinsumr   r   rL   softmaxr5   r   flatten)r*   rR   attention_mask	head_maskoutput_attentionsmixed_query_layer	key_layervalue_layerquery_layerattention_scorespositional_embeddingrelative_position_scoresattention_probscontext_layeroutputss                  r+   rT   zMCTCTSelfAttention.forward   s    !JJ}55-	$:R0S0SS--dhh}.E.EFF	//

=0I0IJJ//0ABB !<Y5H5HR5P5PQQ  $6=#(<0BDXZeZoZopqstZuZu#v#v #'#J#JKc#d#d +.FF%/.@ -//0@b/II ,,77  -	9O_kBB%--aAq99AABAOO6G]=/22mM]r-   NNF)	rV   rW   rX   r/   r   r   r   rT   rZ   r[   s   @r+   r   r      s        , , , , ,.% % %
Q Q Q
* * *8 . . . . . . . .r-   r   c                   $     e Zd Z fdZd Z xZS )ro   c                     t                                                       t          j        t	          j        d                    | _        t          j        t	          j        d                    | _        d S Nr   )	r.   r/   r   	ParameterrK   onessingleton_weightru   singleton_bias)r*   rB   s    r+   r/   zMCTCTLayerNorm.__init__  sS     "UZ]] ; ; l5;q>>::r-   c                 &    || j         z  | j        z   S N)r   r   r*   rR   s     r+   rT   zMCTCTLayerNorm.forward  s     559LLLr-   rV   rW   rX   r/   rT   rZ   r[   s   @r+   ro   ro     sN        ; ; ; ; ;
M M M M M M Mr-   ro   c                   $     e Zd Z fdZd Z xZS )MCTCTSelfOutputc                 (   t                                                       || _        t          j        |j        |j        d          | _        t          j        |j        |j                  | _        t          j	        |j
                  | _        d S NFr   )eps)r.   r/   r0   r   r   r;   denserp   layer_norm_epsr3   rq   r5   rA   s     r+   r/   zMCTCTSelfOutput.__init__!  sr    Yv163EERRR
f&8f>STTTz&"<==r-   c                     |                      |          }|                     |          }|                     ||z             }|S r   r   r5   rp   r*   rR   input_tensors      r+   rT   zMCTCTSelfOutput.forward(  @    

=11]33}|'CDDr-   r   r[   s   @r+   r   r      sG        > > > > >      r-   r   c                   2     e Zd Z fdZd Z	 	 	 ddZ xZS )MCTCTAttentionc                     t                                                       t          |          | _        t	          |          | _        t                      | _        d S r   )r.   r/   r   r*   r   outputsetpruned_headsrA   s     r+   r/   zMCTCTAttention.__init__0  sI    &v..	%f--EEr-   c                    t          |          dk    rd S t          || j        j        | j        j        | j                  \  }}t          | j        j        |          | j        _        t          | j        j        |          | j        _        t          | j        j	        |          | j        _	        t          | j
        j        |d          | j
        _        | j        j        t          |          z
  | j        _        | j        j        | j        j        z  | j        _        | j                            |          | _        d S )Nr   r   rH   )r   r   r*   r   r   r   r   r   r   r   r   r   r   union)r*   r   indexs      r+   prune_headszMCTCTAttention.prune_heads6  s    u::??F7490$)2OQUQb
 
u
 -TY_eDD	*49=%@@	,TY_eDD	.t{/@%QOOO )-	(EE

(R	%"&)"?$)B_"_	 -33E::r-   NFc                     |                      ||||          }|                     |d         |          }|f|dd          z   }|S )Nr   r   )r*   r   )r*   rR   r   r   r   self_outputsattention_outputr   s           r+   rT   zMCTCTAttention.forwardH  sX     yy	
 
  ;;|AFF#%QRR(88r-   r   )rV   rW   rX   r/   r   rT   rZ   r[   s   @r+   r   r   /  sf        " " " " "; ; ;*        r-   r   c                   $     e Zd Z fdZd Z xZS )MCTCTIntermediatec                    t                                                       t          j        |j        |j        d          | _        t          |j        t                    rt          |j                 | _        d S |j        | _        d S )NFr   )r.   r/   r   r   r;   intermediate_sizer   
isinstance
hidden_actstrr   intermediate_act_fnrA   s     r+   r/   zMCTCTIntermediate.__init__\  st    Yv163KRWXXX
f'-- 	9'-f.?'@D$$$'-'8D$$$r-   c                 Z    |                      |          }|                     |          }|S r   )r   r   r   s     r+   rT   zMCTCTIntermediate.forwardd  s,    

=1100??r-   r   r[   s   @r+   r   r   [  sG        9 9 9 9 9      r-   r   c                   $     e Zd Z fdZd Z xZS )MCTCTOutputc                    t                                                       t          j        |j        |j        d          | _        t          j        |j        |j                  | _        t          j	        |j
                  | _        d S r   )r.   r/   r   r   r   r;   r   rp   r   r3   rq   r5   rA   s     r+   r/   zMCTCTOutput.__init__k  sl    Yv79KRWXXX
f&8f>STTTz&"<==r-   c                     |                      |          }|                     |          }|                     ||z             }|S r   r   r   s      r+   rT   zMCTCTOutput.forwardq  r   r-   r   r[   s   @r+   r   r   j  sG        > > > > >      r-   r   c                   8     e Zd Zdef fdZ	 	 	 ddZd Z xZS )
MCTCTLayerr0   c                     t                                                       d| _        |j        | _        t	          |          | _        t          |          | _        |j        | _        t          |          | _
        d S r   )r.   r/   seq_len_dimchunk_size_feed_forwardr   intermediater   	attentionr   r   r   rA   s     r+   r/   zMCTCTLayer.__init__y  sh    '-'E$-f55'// +!&))r-   NFc                     |                      ||||          }|d         }|dd          }t          | j        | j        | j        |          }|f|z   }|S )N)r   r   r   )r   r   feed_forward_chunkr   r   )	r*   rR   r   r   r   self_attention_outputsr   r   layer_outputs	            r+   rT   zMCTCTLayer.forward  sw     "&>9HY "0 "
 "
 2!4(,0#T%A4CSUe
 
  /G+r-   c                 \    |                      |          }|                     ||          }|S r   )r   r   )r*   r   intermediate_outputr   s       r+   r   zMCTCTLayer.feed_forward_chunk  s2    "//0@AA{{#68HIIr-   r   )rV   rW   rX   r   r/   rT   r   rZ   r[   s   @r+   r   r   x  sr        	*{ 	* 	* 	* 	* 	* 	*    *      r-   r   c                   L    e Zd ZU dZeed<   dZdZdZd Z	de
j        fdZd	 Zd
S )MCTCTPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    r0   mctctrQ   Tc                    | j         j        }t          |t          j                  rH|j        j                            d|           |j        |j        j        	                                 nt          |t          j
                  rR|j        j                            d|           |j        )|j        j        |j                 	                                 nt          |t          j                  r>|j        j        	                                 |j        j                            d           nRt          |t                    r=|j        j                            d           |j        j        	                                 t          |t          j        t          j        f          rH|j        j                            d|           |j        "|j        j        	                                 dS dS dS )zInitialize the weightsg        )meanstdNg      ?)r0   initializer_ranger   r   r   r   datanormal_r   zero_rg   r_   rp   fill_ro   r   r   r"   )r*   moduler  s      r+   _init_weightsz"MCTCTPreTrainedModel._init_weights  s   k+fbi(( 	/ M&&CS&999{& &&(((-- 		/M&&CS&999!-"6#56<<>>>-- 	/K""$$$M$$S))))// 	/#(..s333!&,,...fry")455 	)M&&CS&999{& &&(((((	) 	)&&r-   input_lengthsc                     d}t          t          | j        j                  | j        j        | j        j                  D ]9\  }}}|dz  }|d|z  z   ||dz
  z  z
  dz
  }t          j        ||d          dz   }:|S )zH
        Computes the output length of the convolutional layers
        r   r   trunc)rounding_mode)zipr   r0   r6   r<   r=   rK   div)r*   r  dilation_	kernel_szr    r!   s          r+    _get_feat_extract_output_lengthsz5MCTCTPreTrainedModel._get_feat_extract_output_lengths  s     $'$+-..0GI`%
 %
 	X 	X Ay&  1nG)AK7(iRSm:TTWXXM!ImV7SSSVWWMMr-   c                    t          |j                  dk    r|d d d d df         }|                     |                    d                    }|                                d         }t          j        ||f|j        |j                  }d|t          j	        ||j                  |dz
  f<   |
                    dg                              d          
                    dg                                          }|S )Nr   ra   r   rd   r   r   )r   r   r  rJ   rF   rK   ru   re   rf   rs   flipcumsumrv   )r*   feature_vector_lengthr   subsampled_lengthsbszs        r+   "_get_feature_vector_attention_maskz7MCTCTPreTrainedModel._get_feature_vector_attention_mask  s    ~#$$q((+AAAqqq"H5N "BB>CUCUVXCYCYZZ!!##A&'(0D^Mb
 
 
 efS1FGGGI[^_I_`a',,bT2299"==BBB4HHMMOOr-   N)rV   rW   rX   rY   r   __annotations__base_model_prefixmain_input_namesupports_gradient_checkpointingr  rK   
LongTensorr  r  rE   r-   r+   r  r    s{          
 &O&*#) ) )0e>N        r-   r  aH  
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`MCTCTConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a  
    Args:
        input_features (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`Wav2Vec2CTCTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
c                   ~     e Zd Zdef fdZ	 	 	 ddej        dej        dej        ded	ed
edee	e
f         fdZ xZS )MCTCTEncoderr0   c                 .   t                                                     j        | _        t                      | _        t                    | _        t          j        fdt          j
                  D                       | _        d| _        d S )Nc                 .    g | ]}t                    S rE   )r   )r'   r  r0   s     r+   
<listcomp>z)MCTCTEncoder.__init__.<locals>.<listcomp>  s!    $a$a$aAZ%7%7$a$a$ar-   F)r.   r/   rq   ro   
layer_normr   rS   r   r>   r   num_hidden_layerslayersgradient_checkpointingrA   s    `r+   r/   zMCTCTEncoder.__init__  s       #)#= (**)&11	m$a$a$a$avG_A`A`$a$a$abb&+###r-   FTrQ   r   r   r   output_hidden_statesreturn_dictreturnc                    ||n| j         j        }||n| j         j        }||n| j         j        }|                     |          }|                     |          }|!|                     |j        d         |          }t          j	        
                    || j        | j                  }|t          ||j                  }|rdnd }	|rdnd }
|p|                                d         t!          | j                  k    r@t%          dt!          | j                   d|                                d          d          t'                      pt)          |           }t+          | j                  D ]d\  }}|r|	|fz   }	t-          j        g           }| j        o|| j         j        k     }|r|r ||||          }|d         }|rd	}|r|
|d         fz   }
e|r|	|fz   }	|st3          d
 ||	|
fD                       S t5          ||	|
          S )Nr   )ptrainingrE   r   z&The head_mask should be specified for z layers, but it is for .)rR   r   r   )NNc              3      K   | ]}||V  	d S r   rE   )r'   vs     r+   r,   z'MCTCTEncoder.forward.<locals>.<genexpr>`  s(      eeqWXWdWdWdWdWdeer-   last_hidden_staterR   
attentions)r0   r   r,  use_return_dictr(  rS   r  r   r   rL   r5   rq   r1  r   re   rF   r   r*  r:   r   r   r?   rK   rand	layerdroptupler   )r*   rQ   r   r   r   r,  r-  ry   rR   encoder_statesall_attentionssynced_gpusidxencoder_layerdropout_probabilityskip_the_layerlayer_outputss                    r+   rT   zMCTCTEncoder.forward  s    2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]88		.11 %!DD]EXYZE[]kllN--mt?Wbfbo-pp %7H[\\N3=0:d  ~~"c$+&6&666 <S=M=M < <%.^^%5%5a%8< < <  
 122R6LT6R6R"+DK"8"8 	F 	FC# C!/=2B!B #(*R..!]Z/BT[EZ/ZN! 1[ 1 -"/#1&7! ! ! !.a 0 - ,  F!/=3C2E!E 	?+}.>>N 	fee]NN$Seeeeee+>Vd
 
 
 	
r-   )FFT)rV   rW   rX   r   r/   rK   Tensorboolr   r;  r   rT   rZ   r[   s   @r+   r$  r$    s        ,{ , , , , , , #(%* I
 I
I
 I
 <	I

  I
 #I
 I
 
uo%	&I
 I
 I
 I
 I
 I
 I
 I
r-   r$  zaThe bare M-CTC-T Model transformer outputting raw hidden-states without any specific head on top.c                   ,    e Zd Z fdZ ee                    d                     eee	e
de          	 	 	 	 	 ddej        deej                 deej                 d	ee         d
ee         dee         deee	f         fd                        Z xZS )
MCTCTModelc                     t                                          |           || _        t          |          | _        |                                  d S r   )r.   r/   r0   r$  encoder	post_initrA   s     r+   r/   zMCTCTModel.__init__k  sI       #F++ 	r-   zbatch_size, sequence_lengthaudio)
checkpointoutput_typeconfig_classmodalityexpected_outputNrQ   r   r   r   r,  r-  r.  c                 "   ||n| j         j        }||n| j         j        }||n| j         j        }|t	          d          |                     ||||||          }|d         }|s|f|dd          z   S t          ||j        |j                  S )Nz#You have to specify input_features.r   r   r   r,  r-  r   r   r5  )	r0   r   r,  r8  r:   rI  r   rR   r7  )	r*   rQ   r   r   r   r,  r-  encoder_outputssequence_outputs	            r+   rT   zMCTCTModel.forwardt  s    " 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]!BCCC,,)/!5# ' 
 
 *!, 	<#%(;;;-)7&1
 
 
 	
r-   )NNNNN)rV   rW   rX   r/   r
   MCTCT_INPUTS_DOCSTRINGformatr   _CHECKPOINT_FOR_DOCr   _CONFIG_FOR_DOC_EXPECTED_OUTPUT_SHAPErK   rD  r   rE  r   r;  rT   rZ   r[   s   @r+   rG  rG  f  s#       
     +*+A+H+HIf+g+ghh&#$.   26,0,0/3&*#
 #
#
 !.#
 EL)	#

 $D>#
 'tn#
 d^#
 
uo%	&#
 #
 #
  ih#
 #
 #
 #
 #
r-   rG  zcMCTCT Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).c                   "    e Zd Z fdZ ee           eeee	e
e          	 	 	 	 	 	 ddej        deej                 deej                 dee         dee         d	ee         d
eej                 deeef         fd                        Z xZS )MCTCTForCTCc                 $   t                                          |           t          |          | _        |j        t          d| j         d          |j        }t          j	        ||j                  | _
        |                                  d S )NzYou are trying to instantiate z with a configuration that does not define the vocabulary size of the language model head. Please instantiate the model as follows: `MCTCTForCTC.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of your model's configuration.)r.   r/   rG  r  rh   r:   rB   r;   r   r   ctc_headrJ  )r*   r0   output_hidden_sizerB   s      r+   r/   zMCTCTForCTC.__init__  s       ''
$H H H H   $/	"4f6GHH 	r-   )rL  rM  rN  rP  expected_lossNrQ   r   r   r   r,  r-  labelsr.  c           
      b   |>|                                 | j        j        k    rt          d| j        j                   ||n| j        j        }|                     ||||||          }|d         }	|                     |	          }
d}|g||n,t          j        |j	        dd         t          j
                  }|                     |                    d                                        t          j
                  }|dk    }|                    d          }|                    |          }t          j                            |
dt          j                                      dd          }t          j        j                            d	
          5  t          j                            ||||| j        j        | j        j        | j        j                  }ddd           n# 1 swxY w Y   |s|
f|t6          d         z   }||f|z   n|S t9          ||
|j        |j                  S )a  
        labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
            Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
            the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
            All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
            config.vocab_size - 1]`.
        Nz$Label values must be <= vocab_size: rR  r   ra   )re   )rI   re   r   F)enabled)blank	reductionzero_infinity)losslogitsrR   r7  )maxr0   rh   r:   r8  r  r]  rK   r   r   rv   r  rJ   tomasked_selectr   rL   log_softmaxfloat32rN   backendscudnnflagsctc_lossri   ctc_loss_reductionctc_zero_infinity_HIDDEN_STATES_START_POSITIONr   rR   r7  )r*   rQ   r   r   r   r,  r-  r`  r   rR   rg  rf  r  labels_masktarget_lengthsflattened_targets	log_probsr   s                     r+   rT   zMCTCTForCTC.forward  s   2 &**,,$+2H"H"H\DKDZ\\]]]%0%<kk$+B]**)/!5#  
 
  
}-- "- Z 4SbS 9LLL 
 !AA.BTBTUWBXBXYY\\]b]ghhM !A+K(__R00N & 4 4[ A A 11&b1VV``abdeffI%++E+:: 	 	}--%!"+2"k<"&+"? .  	 	 	 	 	 	 	 	 	 	 	 	 	 	 	  	FY)F)G)G!HHF)-)9TGf$$vEfG4IV]Vh
 
 
 	
s   AG**G.1G.)NNNNNN)rV   rW   rX   r/   r
   rU  r   rW  r   rX  _CTC_EXPECTED_OUTPUT_CTC_EXPECTED_LOSSrK   rD  r   rE  r"  r   r;  rT   rZ   r[   s   @r+   r[  r[    s:       
    & +*+ABB&"$,(   26,0,0/3&*-1E
 E
E
 !.E
 EL)	E

 $D>E
 'tnE
 d^E
 )*E
 
un$	%E
 E
 E
  CBE
 E
 E
 E
 E
r-   r[  )r[  rG  r  )<rY   r   typingr   r   rK   r   activationsr   
file_utilsr   r	   r
   integrations.deepspeedr   integrations.fsdpr   modeling_attn_mask_utilsr   modeling_layersr   modeling_outputsr   r   modeling_utilsr   pytorch_utilsr   r   r   utilsr   configuration_mctctr   
get_loggerrV   loggerrs  rX  rW  rY  rx  ry  Moduler   r]   r   ro   r   r   r   r   r   r  MCTCT_START_DOCSTRINGrU  r$  rG  r[  __all__rE   r-   r+   <module>r     s;      " " " " " " " "        " " " " " " r r r r r r r r r r A A A A A A 8 8 8 8 8 8 C C C C C C : : : : : : @ @ @ @ @ @ @ @ . . . . . . m m m m m m m m m m       , , , , , , 
	H	%	% !  2 '  t  : : : : :BI : : :z7 7 7 7 7bi 7 7 7ti i i i i i i iXM M M M MRY M M M    bi   ) ) ) ) )RY ) ) )X    	       ")   $ $ $ $ $+ $ $ $NB B B B B? B B BJ	  @T
 T
 T
 T
 T
' T
 T
 T
n g 5
 5
 5
 5
 5
% 5
 5
	 5
p m a
 a
 a
 a
 a
& a
 a
	 a
H @
?
?r-   