
    &`i1                        d dl mZmZmZ d dlmZ d dlmZm	Z
mZmZ d dlmZ d dlmZ d dlmZ d dlmZ erd dlmZ  ed	
           G d de
                      Z	 ed	
           G d de                      Z ed	
           G d de                      Z ed	
           G d de                      Z ed	
           G d de                      Z eddd           G d de                      Z eddd           G d de                      Z ed	
          ddddd d!dd"ee         d#ee         d$ee         d%eee                  d&d'fd(            Z ed	
          d)ed&d'fd*            Z  ed	
          d+ed&d'fd,            Z! ed	
          ddd-d!dd"ee         d$ee         d&d'fd.            Z" ed	
          d/ed&d'fd0            Z#g d1Z$dS )2    )TYPE_CHECKINGOptionalType)
Deprecated)CloudMirrorConfig	LLMConfig
LoraConfigModelLoadingConfig)LLMServingArgs)OpenAiIngress)	LLMServer)	PublicAPI)Applicationalpha)	stabilityc                       e Zd ZdZdS )r   z1The configuration for starting an LLM deployment.N__name__
__module____qualname____doc__     j/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/ray/serve/llm/__init__.pyr   r       s        ;;Dr   r   c                       e Zd ZdZdS )r   z=The configuration for starting an LLM deployment application.Nr   r   r   r   r   r   '   s        GGDr   r   c                       e Zd ZdZdS )r
   z+The configuration for loading an LLM model.Nr   r   r   r   r
   r
   .   s        55Dr   r
   c                       e Zd ZdZdS )r   z@The configuration for mirroring an LLM model from cloud storage.Nr   r   r   r   r   r   5   s        JJDr   r   c                       e Zd ZdZdS )r	   z5The configuration for loading an LLM model with LoRA.Nr   r   r   r   r	   r	   <   s        ??Dr   r	   zray.serve.llm.LLMServerz"ray.serve.llm.deployment.LLMServerF)oldnewerrorc                       e Zd ZdS )r   Nr   r   r   r   r   r   r   r   H   s         	Dr   r   zray.serve.llm.LLMRouterz#ray.serve.llm.ingress.OpenAIIngressc                       e Zd ZdS )	LLMRouterNr#   r   r   r   r%   r%   O   s         	Dr   r%   N)name_prefixbind_kwargsoverride_serve_optionsdeployment_cls
llm_configr&   r'   r(   r)   returnr   c                .    ddl m}  || ||||          S )aq  Helper to build a single vllm deployment from the given llm config.

    Examples:
        .. testcode::
            :skipif: True

            from ray import serve
            from ray.serve.llm import LLMConfig, build_llm_deployment

            # Configure the model
            llm_config = LLMConfig(
                model_loading_config=dict(
                    model_id="llama-3.1-8b",
                    model_source="meta-llama/Llama-3.1-8b-instruct",
                ),
                deployment_config=dict(
                    autoscaling_config=dict(
                        min_replicas=1,
                        max_replicas=2,
                    )
                ),
                accelerator_type="A10G",
            )

            # Build the deployment
            llm_app = build_llm_deployment(llm_config)

            # Deploy the application
            model_handle = serve.run(llm_app)

            # Querying the model handle
            import asyncio
            model_handle = model_handle.options(stream=True)
            async def query_model(model_handle):
                from ray.serve.llm.openai_api_models import ChatCompletionRequest

                request = ChatCompletionRequest(
                    model="qwen-0.5b",
                    messages=[
                        {
                            "role": "user",
                            "content": "Hello, world!"
                        }
                    ]
                )

                resp = model_handle.chat.remote(request)
                async for message in resp:
                    print("message: ", message)

            asyncio.run(query_model(model_handle))

    Args:
        llm_config: The llm config to build vllm deployment.
        name_prefix: Optional prefix to be used for the deployment name.
        bind_kwargs: Optional kwargs to pass to the deployment.
        override_serve_options: Optional serve options to override the original serve options based on the llm_config.
        deployment_cls: Optional deployment class to use.

    Returns:
        The configured Ray Serve Application for vllm deployment.
    r   )build_llm_deployment)r*   r&   r'   r(   r)   )+ray.llm._internal.serve.core.server.builderr-   )r*   r&   r'   r(   r)   r-   s         r   r-   r-   ]   sL    N       5%   r   llm_serving_argsc                 &    ddl m}  ||           S )a  Helper to build an OpenAI compatible app with the llm deployment setup from
    the given llm serving args. This is the main entry point for users to create a
    Serve application serving LLMs.


    Examples:
        .. code-block:: python
            :caption: Example usage in code.

            from ray import serve
            from ray.serve.llm import LLMConfig, LLMServingArgs, build_openai_app

            llm_config1 = LLMConfig(
                model_loading_config=dict(
                    model_id="qwen-0.5b",
                    model_source="Qwen/Qwen2.5-0.5B-Instruct",
                ),
                deployment_config=dict(
                    autoscaling_config=dict(
                        min_replicas=1, max_replicas=2,
                    )
                ),
                accelerator_type="A10G",
            )

            llm_config2 = LLMConfig(
                model_loading_config=dict(
                    model_id="qwen-1.5b",
                    model_source="Qwen/Qwen2.5-1.5B-Instruct",
                ),
                deployment_config=dict(
                    autoscaling_config=dict(
                        min_replicas=1, max_replicas=2,
                    )
                ),
                accelerator_type="A10G",
            )

            # Deploy the application
            llm_app = build_openai_app(
                LLMServingArgs(
                    llm_configs=[
                        llm_config1,
                        llm_config2,
                    ]
                )
            )
            serve.run(llm_app)


            # Querying the model via openai client
            from openai import OpenAI

            # Initialize client
            client = OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key")

            # Basic completion
            response = client.chat.completions.create(
                model="qwen-0.5b",
                messages=[{"role": "user", "content": "Hello!"}]
            )

        .. code-block:: yaml
            :caption: Example usage in YAML.

            # config.yaml
            applications:
            - args:
                llm_configs:
                    - model_loading_config:
                        model_id: qwen-0.5b
                        model_source: Qwen/Qwen2.5-0.5B-Instruct
                      accelerator_type: A10G
                      deployment_config:
                        autoscaling_config:
                            min_replicas: 1
                            max_replicas: 2
                    - model_loading_config:
                        model_id: qwen-1.5b
                        model_source: Qwen/Qwen2.5-1.5B-Instruct
                      accelerator_type: A10G
                      deployment_config:
                        autoscaling_config:
                            min_replicas: 1
                            max_replicas: 2
              import_path: ray.serve.llm:build_openai_app
              name: llm_app
              route_prefix: "/"


    Args:
        llm_serving_args: A dict that conforms to the LLMServingArgs pydantic model.

    Returns:
        The configured Ray Serve Application router.
    r   )build_openai_appbuilder_config),ray.llm._internal.serve.core.ingress.builderr1   )r/   r1   s     r   r1   r1      s8    D      +;<<<<r   pd_serving_argsc                 &    ddl m}  ||           S )a$
  Build a deployable application utilizing P/D disaggregation.


    Examples:
        .. code-block:: python
            :caption: Example usage in code.

            from ray import serve
            from ray.serve.llm import LLMConfig, build_pd_openai_app

            config = LLMConfig(
                model_loading_config=dict(
                    model_id="qwen-0.5b",
                    model_source="Qwen/Qwen2.5-0.5B-Instruct",
                ),
                deployment_config=dict(
                    autoscaling_config=dict(
                        min_replicas=1, max_replicas=2,
                    )
                ),
                accelerator_type="A10G",
            )

            # Deploy the application
            llm_app = build_pd_openai_app(
                dict(
                    prefill_config=config,
                    decode_config=config,
                )
            )

            serve.run(llm_app)


            # Querying the model via openai client
            from openai import OpenAI

            # Initialize client
            client = OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key")

            # Basic completion
            response = client.chat.completions.create(
                model="qwen-0.5b",
                messages=[{"role": "user", "content": "Hello!"}]
            )

        .. code-block:: yaml
            :caption: Example usage in YAML.

            # config.yaml
            applications:
            - args:
                prefill_config:
                    model_loading_config:
                        model_id: qwen-0.5b
                        model_source: Qwen/Qwen2.5-0.5B-Instruct
                    accelerator_type: A10G
                    deployment_config:
                        autoscaling_config:
                            min_replicas: 1
                            max_replicas: 2
                decode_config:
                    model_loading_config:
                    model_id: qwen-1.5b
                    model_source: Qwen/Qwen2.5-1.5B-Instruct
                    accelerator_type: A10G
                    deployment_config:
                    autoscaling_config:
                        min_replicas: 1
                        max_replicas: 2
              import_path: ray.serve.llm:build_pd_openai_app
              name: llm_app
              route_prefix: "/"


    Args:
        pd_serving_args: The dictionary containing prefill and decode configs. See PDServingArgs for more details.

    Returns:
        The configured Ray Serve Application router.
    r   )build_pd_openai_app)r5   )?ray.llm._internal.serve.serving_patterns.prefill_decode.builderr7   )r5   r7   s     r   r7   r7     s7    f      ????r   )r&   r(   c                *    ddl m}  || ||          S )aq  Build a data parallel attention LLM deployment.

    Args:
        llm_config: The LLM configuration.
        name_prefix: The prefix to add to the deployment name.
        override_serve_options: The optional serve options to override the
            default options.

    Returns:
        The Ray Serve Application for the data parallel attention LLM deployment.
    r   )build_dp_deployment)r*   r&   r(   )>ray.llm._internal.serve.serving_patterns.data_parallel.builderr:   )r*   r&   r(   r:   s       r   r:   r:   t  sE    $      5   r   dp_serving_argsc                 &    ddl m}  ||           S )a@  Build an OpenAI compatible app with the DP attention deployment
    setup from the given builder configuration.

    Args:
        dp_serving_args: The configuration for the builder. It has to conform
            to the DPOpenAiServingArgs pydantic model.

    Returns:
        The configured Ray Serve Application.
    r   )build_dp_openai_appr2   )r;   r>   )r<   r>   s     r   r>   r>     s6          o>>>>r   )r   r   r
   r   r	   r-   r1   r7   r:   r>   r   r%   )%typingr   r   r   ray._common.deprecationr   /ray.llm._internal.serve.core.configs.llm_configr   _CloudMirrorConfigr   
_LLMConfigr	   _LoraConfigr
   _ModelLoadingConfigr4   r   _LLMServingArgs,ray.llm._internal.serve.core.ingress.ingressr   _OpenAiIngress.ray.llm._internal.serve.core.server.llm_serverr   
_LLMServerray.util.annotationsr   ray.serve.deploymentr   r%   strdictr-   r1   r7   r:   r>   __all__r   r   r   <module>rP      s   0 0 0 0 0 0 0 0 0 0 . . . . . .                     
      + * * * * * 1000000 W	 	 	 	 	
 	 	 	 W	 	 	 	 	_ 	 	 	 W	 	 	 	 	, 	 	 	 W	 	 	 	 	* 	 	 	 W	 	 	 	 	 	 	 	 !'KSX  	 	 	 	 	
 	 	 	 !-
  
	 	 	 	 	 	 	 
	 W "&"&-104P P PP #P $	P
 %TNP T)_-P P P P Pf We=t e= e= e= e= e=P WV@ V@- V@ V@ V@ V@r W "&-1	   # %TN	
    8 W? ?- ? ? ? ?$  r   