
    &`i1                         d dl mZ d dlmZ d dlmZ d dl	m
Z
  e
d           G d de                      Z e
d           G d	 d
e                      Z e
d           G d de                      Zg dZdS )    )	LLMServer)DPServer)PDProxyServer)	PublicAPIalpha)	stabilityc                       e Zd ZdZdS )r   a  The implementation of the vLLM engine deployment.

    To build a Deployment object you should use `build_llm_deployment` function.
    We also expose a lower level API for more control over the deployment class
    through `serve.deployment` function.

    Examples:
        .. testcode::
            :skipif: True

            from ray import serve
            from ray.serve.llm import LLMConfig
            from ray.serve.llm.deployment import LLMServer

            # Configure the model
            llm_config = LLMConfig(
                model_loading_config=dict(
                    served_model_name="llama-3.1-8b",
                    model_source="meta-llama/Llama-3.1-8b-instruct",
                ),
                deployment_config=dict(
                    autoscaling_config=dict(
                        min_replicas=1,
                        max_replicas=8,
                    )
                ),
            )

            # Build the deployment directly
            serve_options = LLMServer.get_deployment_options(llm_config)
            llm_app = serve.deployment(LLMServer).options(
                **serve_options).bind(llm_config)

            model_handle = serve.run(llm_app)

            # Query the model via `chat` api
            from ray.serve.llm.openai_api_models import ChatCompletionRequest
            request = ChatCompletionRequest(
                model="llama-3.1-8b",
                messages=[
                    {
                        "role": "user",
                        "content": "Hello, world!"
                    }
                ]
            )
            response = ray.get(model_handle.chat(request))
            print(response)
    N__name__
__module____qualname____doc__     l/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/ray/serve/llm/deployment.pyr   r      s        0 0d 	Dr   r   c                       e Zd ZdZdS )r   a  A proxy server for prefill-decode disaggregation.

    This server acts as a proxy in a prefill-decode disaggregated system.
    For chat and completions, proxy sends the request to the prefill server
    with max_tokens=1 and then sends the returned metadata to the decode server.

    Args:
        prefill_server: The prefill server deployment handle.
        decode_server: The decode server deployment handle.
    Nr
   r   r   r   r   r   H   s        	 	 	Dr   r   c                       e Zd ZdZdS )r   a  Data Parallel LLM Server.

    This class is used to serve data parallel attention (DP Attention)
    deployment paradigm, where the attention layers are replicated and
    the MoE layers are sharded. DP Attention is typically used for models
    like DeepSeek-V3.

    To build a Deployment object you should use `build_dp_deployment` function.
    We also expose a lower level API for more control over the deployment class
    through `serve.deployment` function.

    Examples:
        .. testcode::
            :skipif: True

            from ray import serve
            from ray.serve.llm import LLMConfig, build_dp_deployment

            # Configure the model
            llm_config = LLMConfig(
                model_loading_config=dict(
                    model_id="Qwen/Qwen2.5-0.5B-Instruct",
                ),
                engine_kwargs=dict(
                    data_parallel_size=2,
                    tensor_parallel_size=1,
                ),
                experimental_configs=dict(
                    dp_size_per_node=2,
                ),
                accelerator_type="A10G",
            )

            # Build the deployment
            dp_app = build_dp_deployment(llm_config)

            # Deploy the application
            model_handle = serve.run(dp_app)
    Nr
   r   r   r   r   r   X   s        & &P 	Dr   r   )r   r   r   N).ray.llm._internal.serve.core.server.llm_serverr   InternalLLMServer@ray.llm._internal.serve.serving_patterns.data_parallel.dp_serverr   	_DPServerAray.llm._internal.serve.serving_patterns.prefill_decode.pd_serverr   _PDProxyServerray.util.annotationsr   __all__r   r   r   <module>r      sS                  + * * * * * W3	 3	 3	 3	 3	! 3	 3	 3	l W	 	 	 	 	N 	 	 	 W)	 )	 )	 )	 )	y )	 )	 )	X 5
4
4r   