
from ray.llm._internal.serve.core.ingress.ingress import (
    OpenAiIngress as _OpenAiIngress,
    make_fastapi_ingress,
)
from ray.util.annotations import PublicAPI


@PublicAPI(stability="alpha")
class OpenAiIngress(_OpenAiIngress):
    """The implementation of the OpenAI-compatible model router.

    This deployment creates the following endpoints:
      - /v1/chat/completions: Chat interface (OpenAI-style)
      - /v1/completions: Text completion
      - /v1/models: List available models
      - /v1/models/{model}: Model information
      - /v1/embeddings: Text embeddings
      - /v1/audio/transcriptions: Audio transcription
      - /v1/score: Text scoring
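
    Once the application is running, any standard OpenAI client can target
    these routes; a query sketch follows the deployment example below.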


    Examples:
        .. testcode::
            :skipif: True


            from ray import serve
            from ray.serve.llm import LLMConfig
            from ray.serve.llm.deployment import LLMServer
            from ray.serve.llm.ingress import OpenAiIngress, make_fastapi_ingress

            llm_config1 = LLMConfig(
                model_loading_config=dict(
                    model_id="qwen-0.5b",
                    model_source="Qwen/Qwen2.5-0.5B-Instruct",
                ),
                deployment_config=dict(
                    autoscaling_config=dict(
                        min_replicas=1, max_replicas=2,
                    )
                ),
                accelerator_type="A10G",
            )

            llm_config2 = LLMConfig(
                model_loading_config=dict(
                    model_id="qwen-1.5b",
                    model_source="Qwen/Qwen2.5-1.5B-Instruct",
                ),
                deployment_config=dict(
                    autoscaling_config=dict(
                        min_replicas=1, max_replicas=2,
                    )
                ),
                accelerator_type="A10G",
            )

            # Server deployment for the first model
            server_options1 = LLMServer.get_deployment_options(llm_config1)
            server_deployment1 = serve.deployment(LLMServer).options(
                **server_options1).bind(llm_config1)

            # Server deployment for the second model
            server_options2 = LLMServer.get_deployment_options(llm_config2)
            server_deployment2 = serve.deployment(LLMServer).options(
                **server_options2).bind(llm_config2)

            # OpenAI-compatible ingress that routes across both servers
            ingress_options = OpenAiIngress.get_deployment_options(
                llm_configs=[llm_config1, llm_config2])
            ingress_cls = make_fastapi_ingress(OpenAiIngress)
            ingress_deployment = serve.deployment(ingress_cls).options(
                **ingress_options).bind([server_deployment1, server_deployment2])

            # Run the application (blocks until interrupted)
            serve.run(ingress_deployment, blocking=True)
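
        Once deployed, the router can be queried with the OpenAI client. A
        minimal sketch, assuming the app listens on the default
        ``http://localhost:8000``, serves the ``qwen-0.5b`` model from the
        example above, and has no API key validation configured:

        .. testcode::
            :skipif: True

            from openai import OpenAI

            # Point the client at the local Serve ingress.
            client = OpenAI(
                base_url="http://localhost:8000/v1",
                api_key="fake-key",  # placeholder; assumed not validated locally
            )
            response = client.chat.completions.create(
                model="qwen-0.5b",
                messages=[{"role": "user", "content": "Hello!"}],
            )
            print(response.choices[0].message.content)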
    N)__name__
__module____qualname____doc__     i/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/ray/serve/llm/ingress.pyr   r      s        C CJ 	Dr   r   r   N),ray.llm._internal.serve.core.ingress.ingressr   _OpenAiIngressr   ray.util.annotationsr   __all__r   r   r   <module>r      s           + * * * * * WG	 G	 G	 G	 G	N G	 G	 G	T 2
3r   