
    &`iS                         d Z ddlmZmZ ddlmZ ddlmZmZm	Z	 ddl
mZmZ  G d de          Z G d d	e          Zd
S )zJ
This module defines a dataset framework for sampling benchmark requests.
    )ABCabstractmethod)Path)DictListOptional)load_datasetload_from_diskc                   |    e Zd ZdZdefdee         deddfdZed
d            Z	edede
e         fd	            ZdS )BenchmarkDatasetr   Ndataset_pathrandom_seedreturnc                 "    || _         || _        dS )a>  
        Abstract base class for benchmark datasets.

        All benchmark datasets should inherit from this class and implement
        the required abstract methods.

        Args:
            dataset_path: The path to the dataset on disk.
            random_seed: The seed for the random number generator.
        N)_dataset_path_random_seed)selfr   r   s      }/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/ray/llm/_internal/batch/benchmark/dataset.py__init__zBenchmarkDataset.__init__   s     *'    c                      t          d          )z
        Load data from the dataset source into memory.

        Raises:
            NotImplementedError: If the method is not implemented in subclasses.
        z,load_data must be implemented in subclasses.NotImplementedErrorr   s    r   	load_datazBenchmarkDataset.load_data!   s     ""PQQQr   num_requestsc                      t          d          )a<  
        Sample prompts from the loaded dataset.

        Args:
            num_requests: The number of prompts to sample from the dataset.

        Returns:
            A list of sampled request dictionaries.

        Raises:
            NotImplementedError: If the method is not implemented in subclasses.
        z)sample must be implemented in subclasses.r   )r   r   s     r   samplezBenchmarkDataset.sample+   s     ""MNNNr   r   N)__name__
__module____qualname__DEFAULT_RANDOM_SEEDr   strintr   r   r   r   r   r    r   r   r   r      s         '+.( (sm( ( 
	( ( ( ($ R R R ^R O3 O4: O O O ^O O Or   r   c                        e Zd ZdZ	 	 	 ddedededed	ee         d
df fdZddZded
e	e
         fdZd Zd
e	e
         fdZde
d
e
dz  fdZ xZS )ShareGPTDatasetzhImplements the ShareGPT dataset. The first human message of each conversation is used to build a prompt.,Crystalcareai/Code-feedback-sharegpt-renamedtrainNr   seedhf_dataset_idhf_splittruncate_promptr   c                     t                                          ||           || _        || _        || _        || _        d| _        dS )a  
        Initializes the ShareGPTDataset.

        Args:
            dataset_path: The path to the dataset on disk.
            seed: The seed for the random number generator.
            hf_dataset_id: The Hugging Face dataset ID to download if the dataset is not found on disk.
            hf_split: The Hugging Face split to load from the dataset.
            truncate_prompt: Maximum prompt length so that the prompt fits in the model's context window.
        N)superr   _seed_hf_dataset_id	_hf_split_truncate_prompt_data)r   r   r+   r,   r-   r.   	__class__s         r   r   zShareGPTDataset.__init__?   sH    $ 	t,,,
+! /(,


r   c                 J    | j         |                                 | _         dS dS )z,Load data from the dataset path into memory.N)r5   _load_dataset_datar   s    r   r   zShareGPTDataset.load_dataZ   s)    :0022DJJJ r   r   c                     | j         |                                  g }| j         D ]C}t          |          |k    r n-|                     |          }||                    |           D|st          d          |S )z'Sample prompts from the loaded dataset.Nz*ShareGPT dataset yielded no usable prompts)r5   r   len_extract_promptappend
ValueError)r   r   promptsitemprompt_datas        r   r   zShareGPTDataset.sample_   s    :NNJ 	, 	,D7|||++..t44K&{+++ 	KIJJJr   c                 &   t          | j                  }t          d|            t          d|                                            	 |                                rt	          t          |                    }npt          d| j                    |j                            dd           t          | j        | j
                  }|                    t          |                     |S # t          $ r}t          d|           d}~ww xY w)	z'Load dataset from disk or Hugging Face.z Attempting to load dataset from zDataset exists on disk: z:Dataset not found on disk, downloading from Hugging Face: T)parentsexist_ok)splitz Error loading ShareGPT dataset: N)r   r   printexistsr
   r$   r2   parentmkdirr	   r3   save_to_disk	ExceptionRuntimeError)r   pathdatasetes       r   _load_datasetzShareGPTDataset._load_datasetq   s!   D&''777888888999	G{{}} 	0(T33fQUQdff   !!$!>>>&t':$.QQQ$$SYY///N 	G 	G 	GE!EEFFF	Gs   B"C/ /
D9DDc                     |                                                      | j                  }g }t          |          D ]\  }}|                    |           t          dt          |           d           |S )z:Load and process dataset data into a list of dictionaries.)r+   zLoaded z samples from dataset)rO   shuffler1   	enumerater<   rE   r:   )r   dsdatairows        r   r8   z"ShareGPTDataset._load_dataset_data   s~    !!))tz)::mm 	 	FAsKK8D		888999r   r?   c                     |                     d          p|                     d          pg }t          d |D             d          }|r.|                                r| j        r|d| j                 }d|iS dS )z
        Extracts the first human message of a conversation or None.

        The ShareGPT schema uses {"role": "human", "value": ...} for user
        turns.
        messagesconversationsc              3      K   | ]P}|                     d           dv t          |                     dd                                                    V  QdS )role>   userhumanvalue N)getr$   strip).0msgs     r   	<genexpr>z2ShareGPTDataset._extract_prompt.<locals>.<genexpr>   sf        776??&777 CGGGR(())//117777 r   Nprompt)r`   nextra   r4   )r   r?   rX   re   s       r   r;   zShareGPTDataset._extract_prompt   s     88J''J488O+D+DJ #  
 
 
  	&fllnn 	&$ 9 7$"7 78f%%tr   )r)   r*   Nr   )r    r!   r"   __doc__r$   r%   r   r   r   r   r   r   rO   r8   r;   __classcell__)r6   s   @r   r(   r(   <   s#       rr L)-- -- - 	-
 - "#- 
- - - - - -63 3 3 3
3 4:    $G G G,	DJ 	 	 	 	D TD[        r   r(   N)rg   abcr   r   pathlibr   typingr   r   r   datasetsr	   r
   r   r(   r&   r   r   <module>rm      s     $ # # # # # # #       ' ' ' ' ' ' ' ' ' ' 1 1 1 1 1 1 1 1-O -O -O -O -Os -O -O -O`m m m m m& m m m m mr   