
     `i;$                     ~   d dl Z d dlZd dlmZmZ d dlmZ d dlmZm	Z	 d dl
Z
d dlmZ d dlmZ ddlmZ dd	lmZ dd
lmZmZ ddlmZmZmZmZ  ej        e          Z e ej                              Z  e!d e D                       Z"e G d d                      Z# G d de          Z$ G d de          Z%dS )    N)	dataclassfield)Enum)OptionalUnion)FileLock)Dataset   )$MODEL_FOR_QUESTION_ANSWERING_MAPPING)PreTrainedTokenizer)check_torch_load_is_safelogging   )SquadFeaturesSquadV1ProcessorSquadV2Processor"squad_convert_examples_to_featuresc              #   $   K   | ]}|j         V  d S N)
model_type).0confs     t/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/data/datasets/squad.py	<genexpr>r   "   s$      EEDOEEEEEE    c                       e Zd ZU dZ edddd                    e          z   i          Zee	d<    edddi          Z
ee	d	<    ed
ddi          Zee	d<    ed
ddi          Zee	d<    edddi          Zee	d<    edddi          Zee	d<    edddi          Zee	d<    edddi          Zee	d<    edddi          Zee	d<    edddi          Zee	d<    eddd i          Zee	d!<    ed"dd#i          Zee	d$<   dS )%SquadDataTrainingArgumentszb
    Arguments pertaining to what data we are going to input our model for training and eval.
    Nhelpz!Model type selected in the list: z, )defaultmetadatar   zFThe input data dir. Should contain the .json files for the SQuAD task.data_dir   zThe maximum total input sequence length after tokenization. Sequences longer than this will be truncated, sequences shorter will be padded.max_seq_lengthzVWhen splitting up a long document into chunks, how much stride to take between chunks.
doc_stride@   zkThe maximum number of tokens for the question. Questions longer than this will be truncated to this length.max_query_length   zThe maximum length of an answer that can be generated. This is needed because the start and end predictions are not conditioned on one another.max_answer_lengthFz1Overwrite the cached training and evaluation setsoverwrite_cachezDIf true, the SQuAD examples contain some that do not have an answer.version_2_with_negativeg        zIIf null_score - best_non_null is greater than the threshold predict null.null_score_diff_threshold   n_best_sizer   zjlanguage id of input for language-specific xlm models (see tokenization_xlm.PRETRAINED_INIT_CONFIGURATION)lang_id   z3multiple threads for converting example to featuresthreads)__name__
__module____qualname____doc__r   joinMODEL_TYPESr   str__annotations__r!   r#   intr$   r&   r(   r)   boolr*   r+   floatr-   r.   r0    r   r   r   r   %   s          e(KdiiXcNdNd(de  J    E(pq  Hc     %Q
  NC    ers  J    "E/
  c    #UJ
  s    "E)\ ]  OT    %*E)o p% % %T    (-uv'rs( ( (u    uf&qr  K    5C
  GS    5f6k-lmmmGSmmmmmr   r   c                       e Zd ZdZdZdS )SplittraindevN)r1   r2   r3   r?   r@   r<   r   r   r>   r>   h   s        E
CCCr   r>   c                       e Zd ZU dZeed<   ee         ed<   eed<   e	ed<   dej
        dddfded	ed
ee         deeef         de	dee         defdZd Zdeeej        f         fdZdS )SquadDatasetzH
    This will be superseded by a framework-agnostic approach soon.
    argsfeaturesmodeis_language_sensitiveNFpt	tokenizerlimit_length	cache_dirdataset_formatc                 X   || _         || _        |j        rt                      nt	                      | _        t          |t                    r,	 t          |         }n# t          $ r t          d          w xY w|| _
        |j        rdnd}t          j                            ||n|j        d|j         d|j        j         d|j         d|           }	|	dz   }
t'          |
          5  t          j                            |	          r|j        st-          j                    }t/                       t1          j        |	d          | _        | j        d	         | _        | j                            d
d           | _        | j                            dd           | _        t>                               d|	 dt-          j                    |z
             | j        | j        t>          !                    d|	 d           n|t          j"        k    r%| j        #                    |j                  | _        n$| j        $                    |j                  | _        tK          | j        ||j        |j&        |j'        |t          j(        k    |j)        |          \  | _        | _        t-          j                    }t1          j*        | j        | j        | j        d|	           t>                               d|	 dt-          j                    |z
  dd           d d d            d S # 1 swxY w Y   d S )Nzmode is not a valid split namev2v1cached__z.lockT)weights_onlyrD   datasetexamplesz"Loading features from cached file z [took %.3f s]zDeleting cached file z; will allow dataset and examples to be cached in future run)rS   rH   r#   r$   r&   is_trainingr0   return_dataset)rD   rR   rS   z!Saving features into cached file z [took z.3fz s])+rC   rF   r*   r   r   	processor
isinstancer7   r>   KeyErrorrE   ospathr5   r!   value	__class__r1   r#   r   existsr)   timer   torchloadold_featuresrD   getrR   rS   loggerinfowarningr@   get_dev_examplesget_train_examplesr   r$   r&   r?   r0   save)selfrC   rH   rI   rE   rF   rJ   rK   version_tagcached_features_file	lock_pathstarts               r   __init__zSquadDataset.__init__w   s    	%:"/3/Kc)+++QaQcQcdC   	AAT{ A A A?@@@A	":Ddd!w||".IIDMedjee9#6#?ee$BUeeXcee 
  
 )72	i   -	 -	w~~233 ,D<P ,	(***$)J/CRV$W$W$W! !% 1* =#044YEE $ 1 5 5j$ G G]9M]]]_c_h_j_jmr_r   <'4=+@NN&0D & & &  
 59$$$(N$C$CDM$R$RDMM$(N$E$Edm$T$TDM.P!]'#'#6#%)%: $ 3 L#1	/ 	/ 	/+t| 	
!%4<UYUbcc(  
 q8LqqUYU^U`U`chUhqqqq  W-	 -	 -	 -	 -	 -	 -	 -	 -	 -	 -	 -	 -	 -	 -	 -	 -	 -	s   A A5'H+LL#&L#c                 *    t          | j                  S r   )lenrD   )ri   s    r   __len__zSquadDataset.__len__   s    4=!!!r   returnc                 &   | j         |         }t          j        |j        t          j                  }t          j        |j        t          j                  }t          j        |j        t          j                  }t          j        |j        t          j                  }t          j        |j        t          j	                  }t          j        |j
        t          j	                  }|||d}	| j        j        dv r|	d= | j        j        dv r|	                    ||d           | j        j        r|	                    d|i           | j        rG|	                    dt          j        |j        t          j                  | j        j        z  i           | j        t*          j        k    rbt          j        |j        t          j                  }
t          j        |j        t          j                  }|	                    |
|d	           |	S )
N)dtype)	input_idsattention_masktoken_type_ids)xlmroberta
distilbert	camembertrw   )xlnetrx   )	cls_indexp_maskis_impossiblelangs)start_positionsend_positions)rD   r_   tensorru   longrv   rw   r}   r~   r;   r   rC   r   updater*   rF   onesshapeint64r.   rE   r>   r?   start_positionend_position)ri   ifeatureru   rv   rw   r}   r~   r   inputsr   r   s               r   __getitem__zSquadDataset.__getitem__   s   -"L!2%*EEE	g&<EJOOOg&<EJOOOL!2%*EEE	gnEK@@@W%:%+NNN #,,
 
 9#PPP'(9#333MM	VDDEEEy0 @>???) owIO5;)W)W)WZ^ZcZk)kmnnn9###l7+ATTTO!L)=UZPPPMMMoP]^^___r   )r1   r2   r3   r4   r   r8   listr   r>   r:   r?   r   r   r9   r   r7   rn   rq   dictr_   Tensorr   r<   r   r   rB   rB   m   s          %$$$=!!!!
KKK '+"'+&+#'"J J(J 'J sm	J
 CJJ  $J C=J J J J JX" " " S%,%6 7            r   rB   )&rY   r^   dataclassesr   r   enumr   typingr   r   r_   filelockr   torch.utils.datar	   models.auto.modeling_autor   tokenization_utilsr   utilsr   r   processors.squadr   r   r   r   
get_loggerr1   rc   r   keysMODEL_CONFIG_CLASSEStupler6   r   r>   rB   r<   r   r   <module>r      s   
			  ( ( ( ( ( ( ( (       " " " " " " " "        $ $ $ $ $ $ M M M M M M 5 5 5 5 5 5 6 6 6 6 6 6 6 6 t t t t t t t t t t t t 
	H	%	%tE@EGGHH eEE0DEEEEE ?n ?n ?n ?n ?n ?n ?n ?nD    D   
y y y y y7 y y y y yr   