
    &`i`a              
       z   d dl Z d dlZd dlZd dlZd dlmZ d dlmZmZm	Z	m
Z
mZ d dlZd dlZd dlmZ d dlmZ  ej        e          ZdZdZdZd	Zd
ZdZddiZdZdZdZdZdZ dZ!dZ"dZ#dZ$dZ%dZ&dZ'dZ(dZ)dZ*dZ+de,de,de,de
e-         fd Z.h d!h d"h d#/                     e.d$d$d%                    h d#/                     e.d%d%d&                    h d'h d'dZ0d(e-de	e-         fd)Z1d*e-fd+Z2d*e-de,fd,Z3d*e-de,fd-Z4d.e-d*e-de	e-         fd/Z5d0 Z6d.e-d1e-de,fd2Z7d.e-d*e-de	e-         fd3Z8 G d4 d5e          Z9dS )6    N)	lru_cache)DictListOptionalSetTuple)AcceleratorManager) PlacementGroupSchedulingStrategy)            TPU_ACCELERATOR_TYPETPU_TOPOLOGYTPU_WORKER_IDTPU_NAMEzGhttp://metadata.google.internal/computeMetadata/v1/instance/attributes/zMetadata-FlavorGooglezaccelerator-typeztpu-envzinstance-idzagent-worker-numberTPU_VISIBLE_CHIPS(RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPSTPU_CHIPS_PER_HOST_BOUNDSz1,1,1z1,2,1TPU_HOST_BOUNDSr   r   )	v5litepodv6e)v2v3v4v5pr   r   max_xmax_ymax_zreturnc           
      "   t                      }t          t          | dz   t                    D ]a}t          t          |dz   t                    D ]@}t          t          |dz   t                    D ]}|                    | d| d|             Ab|S )zvReturns a set of larger 3D TPU topologies given the max x,y,z value. Using DEFAULT_TPU_NUM_CHIPS_PER_HOST as incrementr   x)setrangeDEFAULT_TPU_NUM_CHIPS_PER_HOSTadd)r   r   r    
topologiesr#   yzs          q/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/ray/_private/accelerators/tpu.py_get_larger_3d_topologiesr,   D   s    J&	3Q  / / *EAI7U
 
 	/ 	/A .	.  / /
 !~~a~~!~~..../	/     >   4x44x88x88x1616x16>   r.   r/   r0   r1   r2   16x3232x32>   2x2x12x2x22x2x42x4x4         >   2x8r.   r/   r0   r1   r2   keyc                 b   	 t          j        t          j                            t
          |           t                    }|j        dk    r|j        r|j        S t          j
        d|j         d|j                    n1# t           j        $ r}t          j
        d|           Y d}~nd}~ww xY wdS )zPoll and get TPU metadata.)headers   z2Unable to poll TPU GCE Metadata. Got status code: z and content: z'Unable to poll the TPU GCE Metadata: %sN)requestsgetospathjoinGCE_TPU_ACCELERATOR_ENDPOINTGCE_TPU_HEADERSstatus_codetextloggingdebugRequestException)r=   accelerator_type_requestes      r+   _get_tpu_metadatarO   i   s    D#+<GLL5s;;#$
 $
 $
 
 %0C77(- 8 ,00M< 8 D< <49< <   
 $ D D D?CCCCCCCCD4s   AA> $A> >B,B''B,accelerator_typec                 n    |                      t                    st          d|  dt                     d S )NzInvalid accelerator type: z. Must start with one of: )
startswithVALID_TPU_TYPES
ValueErrorrP   s    r+   _accelerator_type_checkrV      sI    &&77 
f)9ffUdff
 
 	

 
r-   c                 f    t          |            |                     t                    rdS t          S )Nr   )rV   rR   SINGLE_HOST_8_CHIPS_TPU_TYPESr&   rU   s    r+   "get_num_tpu_visible_chips_per_hostrY      s3    ,---""#@AA q))r-   c                 f    t          |            |                     t                    rdS t          S )Nr   )rV   rR   SINGLE_CORE_TPU_TYPESDEFAULT_TPU_NUM_CORES_PER_CHIPrU   s    r+   get_tpu_cores_per_chipr]      s3    ,---""#899 q))r-   topologyc                 `   | r|sdS 	 d}|                                                                                      d          D ]}|t          |          z  }|                                                    dd          }| d| S # t
          $ r}t          d|  d| d	          |d}~ww xY w)
zGInfer the TPU pod type (e.g. v4-32) from topology and accelerator type.Nr   r#   ztpu- -z(Failed to infer pod type from topology 'z' and type '')striplowersplitintreplace	ExceptionrT   )r^   rP   	num_chipsvalue
generationrN   s         r+    infer_tpu_pod_type_from_topologyrl      s      + t
	^^%%++--33C88 	$ 	$EU#II%++--55fbAA
**y***   -x - -)- - -
 
 	s   A>B 
B-B((B-c                     t          j        d          d             }|                    t          | d                                                    }t          j        |          S )Nr   )num_cpusc                  4    t                                           S N)TPUAcceleratorManagerget_current_node_tpu_name r-   r+   _get_tpu_slice_namez9fetch_tpu_slice_name_from_pg.<locals>._get_tpu_slice_name   s    $>>@@@r-   )placement_groupplacement_group_bundle_index)scheduling_strategy)rayremoteoptionsr
   rB   )pgrt   tpu_name_refs      r+   fetch_tpu_slice_name_from_pgr}      s    ZA A A '..<Q
 
 
 /   fhh	  7<   r-   accelerator_versionc                    t           }d}|                                                                                     d          D ]}|t	          |          z  }|dk    r/|                                                                t
          v r|S |S )a  Get the number of chips per host (aka VMs) based on topology and accelerator version.
    The current rule is as follows:
        Default chips per host is 4.
        If accelerator_version is v5e or v6e AND topology product <= 8, the chips per host will just be the proudct. i.e. 1, 4, or 8
        If accelerator_version is v5e or v6e AND topology product > 8, the chips per host will be 4
        If accelerator_version is v5p or other versions, the chips per host will be 4

    Args:
        topology: The TPU topology string (e.g. "2x2x2").
        accelerator_version: The accelerator version of the node (e.g. "V4", "v4").

    Returns:
        A int representing the number of chips per host (aka VM)
    r   r#   r   )r&   rc   rd   re   rf   rX   )r^   r~   chips_per_hosttotal_chipsrj   s        r+   get_chips_per_hostr      s     4NK!!''))//44 " "s5zz! 	q%%''--//3PPPr-   c                    t          | |          }|dS d|d}t          j                            d| ddig|g          }t                              d           d	}t          j        |                                g|
          \  }}|s5t          d	                    ||t          j
                                        t          |          }|t          d          |S )a  Reserves a TPU slice using its head resource and returns the slice name.
    This enables gang scheduling of training workers with multi-host TPUs.
    This is used by JaxTrainer with TPUs in Ray Train.

    Args:
        topology: The TPU topology string (e.g. "2x2x2").
        accelerator_type: The accelerator type of the node (e.g. "TPU-V4").

    Returns:
        A string representing a unique TPU slice name.
    N0)zray.io/tpu-worker-idzray.io/tpu-pod-typeTPU--headr   )bundlesbundle_label_selectorz)Waiting to reserve multi-host slice head.d   )timeoutzFailed to reserve TPU head for slice with shape: {}. Ensure your cluster has sufficient resources. Requesting TPU head node with labels: {}. Current resources: {}zFailed to retrieve TPU slice name after reserving head placement group. Ensure that TPU slice metadata is available and correctly configured on multi-host nodes.)rl   rx   utilru   loggerrK   waitreadyTimeoutErrorformatavailable_resourcesr}   RuntimeError)	r^   rP   pod_typehead_label_selectorhead_placement_groupr   r   _
slice_names	            r+   reserve_tpu_slicer      s(    0:JKKHt !$'  833((((!,-23 4  
 LL<===Gx-33556HHHHE1 
??Ev-s/F/H/H@ @
 
 	
 ..BCCJh
 
 	
 r-   c                      e Zd ZdZedefd            Zedefd            Zedee	e                  fd            Z
e e            defd                        Zededefd            Zed	ed
edefd            Zededeeee         f         fd            Zede	e         ddfd            Zedee         fd            Zedee         fd            Zedee         fd            Zedee         fd            Zedee         fd            Zedee         fd            Zedeeeef                  fd            Zedeeef         fd            ZdS )rq   zGoogle TPU accelerators.r!   c                      dS )NTPUrs   rs   r-   r+   get_resource_namez'TPUAcceleratorManager.get_resource_name  s    ur-   c                      t           S rp   )TPU_VISIBLE_CHIPS_ENV_VARrs   r-   r+   #get_visible_accelerator_ids_env_varz9TPUAcceleratorManager.get_visible_accelerator_ids_env_var  s    ((r-   c                      t           j                            t                                          d           } | d S | dk    rg S t          |                     d                    S )Nr`   ,)rC   environrB   rq   r   listre   )tpu_visible_chipss    r+   +get_current_process_visible_accelerator_idszATPUAcceleratorManager.get_current_process_visible_accelerator_ids  sc    JNN!EEGG
 
 $4""I%++C00111r-   c                     t          j         d          } | rt          |           S 	 t          j        d          }d |D             }t          |          S # t          $ r&}t
                              d|           Y d}~dS d}~ww xY w)a  Attempt to detect the number of TPUs on this machine.

        TPU chips are represented as devices within `/dev/`, either as
        `/dev/accel*` or `/dev/vfio/*`.

        Returns:
            The number of TPUs if any were detected, otherwise 0.
        z/dev/accel*z	/dev/vfioc                 T    g | ]%}|                                 t          |          &S rs   )isdigitrf   ).0entrys     r+   
<listcomp>zKTPUAcceleratorManager.get_current_node_num_accelerators.<locals>.<listcomp>5  s+    WWWeu}}Ws5zzWWWr-   z#Failed to detect number of TPUs: %sNr   )globlenrC   listdirFileNotFoundErrorr   rK   )accel_filesvfio_entriesnumeric_entriesrN   s       r+   !get_current_node_num_acceleratorsz7TPUAcceleratorManager.get_current_node_num_accelerators$  s     i.. 	${###	:k22LWW|WWWO'''  	 	 	LL>BBB11111	s   .A 
B BBtpu_accelerator_typec                 \    t          j        d          }|                    |           sdS dS )a  Check whether the tpu accelerator_type is formatted correctly.

        The accelerator_type field follows a form of v{generation}-{cores/chips}.

        See the following for more information:
        https://cloud.google.com/sdk/gcloud/reference/compute/tpus/tpu-vm/accelerator-types/describe

        Args:
            tpu_accelerator_type: The string representation of the accelerator type
                to be checked for validity.

        Returns:
            True if it's valid, false otherwise.
        z^v\d+[a-zA-Z]*-\d+$FT)recompilematch)r   expected_patterns     r+   is_valid_tpu_accelerator_typez3TPUAcceleratorManager.is_valid_tpu_accelerator_type;  s6      :&<==%%&:;; 	5tr-   tpu_accelerator_versiontpu_topologyc                 $   |                                                                                      d          d         }|                                t          vs3|                                                                 t          |         vrdS dS )a*  Check whether the tpu topology is valid.

        The accelerator_type field follows a form of v{generation}.
        The accelerator_topology field follows either the form {A}x{B} or {A}x{B}x{C} depending on the v{generation}

        Args:
            tpu_accelerator_version: The string representation of the accelerator version. (e.g. v6e, V5P)
            tpu_topology: The string representation of the accelerator topology
                to be checked for validity

        Returns:
            True if it's valid topology, false othrwise
        ra   r   FT)rc   rd   re   VALID_TPU_TOPOLOGY)r   r   tpu_version_formatteds      r+   !is_valid_tpu_accelerator_topologyz7TPUAcceleratorManager.is_valid_tpu_accelerator_topologyP  s    " !8 = = ? ? E E G G M Mc R RST U!''))1CCC!!##))++%&;<= = 5tr-   quantityc                 6    | t           vrdd|  dt            fS dS )NFz)The number of requested 'TPU' was set to zA which is not a supported chip configuration. Supported configs: )TN)TPU_VALID_CHIP_OPTIONS)r   s    r+   "validate_resource_request_quantityz8TPUAcceleratorManager.validate_resource_request_quantityj  sB     111,H , ,), ,   <r-   visible_tpu_chipsNc                    t           j                            t                    rdS t	          |           }t
                                          }||k    rLt           j                            t          d           t           j                            t          d           dS d
                    d | D                       t           j        t
                                          <   |dk    r4t          t           j        t          <   t          t           j        t          <   dS |dk    r4t          t           j        t          <   t          t           j        t          <   dS dS )a'  Set TPU environment variables based on the provided visible_tpu_chips.

        To access a subset of the TPU visible chips, we must use a combination of
        environment variables that tells the compiler (via ML framework) the:
        - Visible chips
        - The physical bounds of chips per host
        - The host bounds within the context of a TPU pod.

        See: https://github.com/google/jax/issues/14977 for an example/more details.

        Args:
            visible_tpu_chips (List[str]): List of int representing TPU chips.
        Nr   c                 ,    g | ]}t          |          S rs   )str)r   is     r+   r   zUTPUAcceleratorManager.set_current_process_visible_accelerator_ids.<locals>.<listcomp>  s    888c!ff888r-   r   r   )rC   r   rB   NOSET_TPU_VISIBLE_CHIPS_ENV_VARr   rq   r   pop!TPU_CHIPS_PER_HOST_BOUNDS_ENV_VARTPU_HOST_BOUNDS_ENV_VARrE   r   'TPU_CHIPS_PER_HOST_BOUNDS_1_CHIP_CONFIGTPU_SINGLE_HOST_BOUNDS'TPU_CHIPS_PER_HOST_BOUNDS_2_CHIP_CONFIG)r   num_visible_tpu_chipsnum_accelerators_on_nodes      r+   +set_current_process_visible_accelerator_idszATPUAcceleratorManager.set_current_process_visible_accelerator_idsx  s4   " :>>9:: 	F #$5 6 6!CCEE 	! !$<<<JNN<dCCCJNN2D999F HH88&788899 	
!EEGG	
 !A%% 8 J1 3IBJ.///"a'' 8 J1 3IBJ.///	 ('r-   c                      t          j        t          d          } | st          t                    } | rt
                              |           r| S t          j        d           dS )aq  Get the TPU pod type of the current node if applicable.

        Individual TPU VMs within a TPU pod must know what type
        of pod it is a part of. This is necessary for the
        ML framework to work properly.

        The logic is different if the TPU was provisioned via:
        ```
        gcloud tpus tpu-vm create ...
        ```
        (i.e. a GCE VM), vs through GKE:
        - GCE VMs will always have a metadata server to poll this info
        - GKE VMS will have environment variables preset.

        Returns:
            A string representing the current TPU pod type, e.g.
            v4-16.

        r`   r=   )r   z'Failed to get a valid accelerator type.N)	rC   getenv GKE_TPU_ACCELERATOR_TYPE_ENV_VARrO   GCE_TPU_ACCELERATOR_KEYrq   r   rJ   rK   rU   s    r+   get_current_node_tpu_pod_typez3TPUAcceleratorManager.get_current_node_tpu_pod_type  sy    , 9%ErJJ 	N05LMMM 	$ 5 S S!1 !T !
 !
 	$ $#?@@@tr-   c                      	 t          j        t          d          } | st          t                    } | S # t
          $ r }t          j        d|           Y d}~dS d}~ww xY w)a7  Return the name of the TPU pod that this worker node is a part of.

        For instance, if the TPU was created with name "my-tpu", this function
        will return "my-tpu".

        If created through the Ray cluster launcher, the
        name will typically be something like "ray-my-tpu-cluster-worker-aa946781-tpu".

        In case the TPU was created through KubeRay, we currently expect that the
        environment variable TPU_NAME is set per TPU pod slice, in which case
        this function will return the value of that environment variable.

        Nr   zCould not get TPU name: %s)rC   r   GKE_TPU_NAME_ENV_VARrO   GCE_TPU_INSTANCE_ID_KEYrT   rJ   rK   )tpu_namerN   s     r+   rr   z/TPUAcceleratorManager.get_current_node_tpu_name  sw    		y!5t<<H J,1HIIIO 	 	 	M6:::44444	s   25 
AAAc                      	 t          j        t          d          } | st          t                    } | rt          |           S dS # t          $ r }t          j        d|           Y d}~dS d}~ww xY w)z'Return the worker index of the TPU pod.Nr   zCould not get TPU worker id: %s)	rC   r   GKE_TPU_WORKER_ID_ENV_VARrO   GCE_TPU_WORKER_ID_KEYrf   rT   rJ   rK   )	worker_idrN   s     r+   get_current_node_tpu_worker_idz4TPUAcceleratorManager.get_current_node_tpu_worker_id  s    		";TBBI I-2GHHH	 9~~%t 	 	 	M;Q???44444	s   AA 
A0A++A0c                  D   t                                           } t                                           }t          |           }||z  }| rC|dk    r=t	          |                     d          d                   }||z  }||z  dk    r|dz  }|S t          j        d           dS )z0Return the total number of workers in a TPU pod.r   ra   r   z%Could not get num workers in TPU pod.N)rq   r   r   r]   rf   re   rJ   rK   )tpu_pod_typer   cores_per_chipcores_per_host	num_coresnum_workerss         r+   "get_num_workers_in_current_tpu_podz8TPUAcceleratorManager.get_num_workers_in_current_tpu_pod  s     -JJLL.PPRR/=='.8 		NQ..L..s33A677I#~5K>)Q..q MABBB4r-   c                  <   	 t           j                            t                    x} r| S t	          t
                    }|r,t          j        d|          } | r|                     d          S d S d S # t          $ r }t          j        d|           Y d }~d S d }~ww xY w)Nr   zTOPOLOGY:\s*'([^']+)'r   zCould not get TPU topology: %s)rC   r   rB   GKE_TPU_TOPOLOGY_ENV_VARrO   GCE_TPU_ENV_KEYr   searchgrouprT   rJ   rK   )r^   tpu_envrN   s      r+   get_current_node_tpu_topologyz3TPUAcceleratorManager.get_current_node_tpu_topology  s    	:>>*BCCCx  'O<<<G -9%=wGG -#>>!,,,- -- - 	 	 	M:A>>>44444	s   'A1 AA1 1
B;BBc                      dt           dt          t                    fd} d}t                                          }|+ | |          }|t                              d|            |t          j        d           |S )a  Attempt to detect the TPU accelerator type.

        The output of this function will return the "ray accelerator type"
        resource (e.g. TPU-V4) that indicates the TPU version.

        We also expect that our TPU nodes contain a "TPU pod type"
        resource, which indicates information about the topology of
        the TPU pod slice.

        We expect that the "TPU pod type" resource to be used when
        running multi host workers, i.e. when TPU units are pod slices.

        We expect that the "ray accelerator type" resource to be used when
        running single host workers, i.e. when TPU units are single hosts.

        Returns:
            A string representing the TPU accelerator type,
            e.g. "TPU-V2", "TPU-V3", "TPU-V4" if applicable, else None.

        r   r!   c                 |    dt          |                     d          d                                                   z   S )Nr   ra   r   )r   re   upperr   s    r+   $tpu_pod_type_to_ray_accelerator_typezeTPUAcceleratorManager.get_current_node_accelerator_type.<locals>.tpu_pod_type_to_ray_accelerator_type(  s6     C 2 23 7 7 : @ @ B BCCCCr-   Nr   zLWhile trying to autodetect a TPU type, received malformed accelerator_type: zFailed to auto-detect TPU type.)r   r   rq   r   r   inforJ   )r   ray_accelerator_typer   s      r+   !get_current_node_accelerator_typez7TPUAcceleratorManager.get_current_node_accelerator_type  s    .	D	Dc]	D 	D 	D 	D
  $,JJLL##G#G)$ $ $  $+K<HK K  
  'L:;;;##r-   c                     i } t                                           }t                                           }t                                           }|r||rd| d}d| |<   |dk    rd| |<   nt	          j        d|||           | r| S dS )a  Get additional resources required for TPU nodes.

        This will populate the TPU pod type and the TPU name which
        is used for TPU pod execution.

        When running workloads on a TPU pod, we need a way to run
        the same binary on every worker in the TPU pod.

        See https://jax.readthedocs.io/en/latest/multi_process.html
        for more information.

        To do this in ray, we take advantage of custom resources. We
        mark worker 0 of the TPU pod as a "coordinator" that identifies
        the other workers in the TPU pod. We therefore need:
        - worker 0 to be targetable.
        - all workers in the TPU pod to have a unique identifier consistent
        within a TPU pod.

        So assuming we want to run the following workload:

        @ray.remote
        def my_jax_fn():
            import jax
            return jax.device_count()

        We could broadcast this on a TPU pod (e.g. a v4-16) as follows:

        @ray.remote(resources={"TPU-v4-16-head"})
        def run_jax_fn(executable):
            # Note this will execute on worker 0
            tpu_name = ray.util.tpu.get_tpu_pod_name()
            num_workers = ray.util.tpu.get_tpu_num_workers()
            tpu_executable = executable.options(resources={"TPU": 4, tpu_name: 1})
            return [tpu_executable.remote() for _ in range(num_workers)]

        Returns:
            A dictionary representing additional resources that may be
            necessary for a particular accelerator type.

        Nr   r   r   r   zSFailed to configure TPU pod. Got: tpu_name: %s, worker_id: %s, accelerator_type: %s)rq   rr   r   r   rJ   r   )	resourcesr   r   r   pod_head_resource_names        r+   %get_current_node_additional_resourcesz;TPUAcceleratorManager.get_current_node_additional_resources?  s    T 	(BBDD)HHJJ	,JJLL 		-,-%?L%?%?%?""#IhA~~45	01LD    	tr-   c                     i } t                                           }|r|| t          j        j        <   t                                           }|!t          |          | t          j        j        <   t                                           }|r|| t          j        j	        <   t           
                                }|r|| t          j        j        <   | S )a  Get default TPU-specific Ray node labels for the current node.

        For TPUs, these labels include:
        - ray.io/tpu-slice-name: the name of the TPU Pod or slice
        - ray.io/tpu-worker-id: the integer worker ID within the slice
        - ray.io/tpu-topology: the TPU topology (e.g. 4x4)
        - ray.io/tpu-pod-type: the TPU pod type (e.g. v4-8)

        Returns:
            A dictionary of TPU label keys and resolved values.
        )rq   rr   rx   _rayletRAY_NODE_TPU_SLICE_NAME_KEYr   r   RAY_NODE_TPU_WORKER_ID_KEYr   RAY_NODE_TPU_TOPOLOGY_KEYr   RAY_NODE_TPU_POD_TYPE_KEY)
tpu_labelsr   r   r   r   s        r+   #get_current_node_accelerator_labelsz9TPUAcceleratorManager.get_current_node_accelerator_labels  s     
(BBDD 	KBJJs{>?)HHJJ	 ADYJs{=>,JJLL 	M@LJs{<=(FFHH 	I@HJs{<=r-   )__name__
__module____qualname____doc__staticmethodr   r   r   r   r   r   r   rf   r   boolr   r   floatr   r   r   r   rr   r   r   r   r   r   r   r  rs   r-   r+   rq   rq     s       ""s    \ ) ) ) ) \) 2$s)9L 2 2 2 \2 Y[[s    [ \* C D    \( !$47	   \2   	tXc]"	#      \  (I9(I	(I (I (I \(IT 8C=    \@ x}    \2 HSM    \      \" 8C=    \ +$x} +$ +$ +$ \+$Z ?8De<L3M ? ? ? \?B c3h    \  r-   rq   ):r   rJ   rC   r   	functoolsr   typingr   r   r   r   r   rA   rx   %ray._private.accelerators.acceleratorr	   ray.util.scheduling_strategiesr
   	getLoggerr  r   r   r   r   r   r   rF   rG   r   r   r   r   r   r   r   r   r   r   r   r&   r\   rX   r[   rS   rf   r   r,   unionr   rO   rV   rY   r]   rl   r}   r   r   rq   rs   r-   r+   <module>r     sm     				 				       3 3 3 3 3 3 3 3 3 3 3 3 3 3  



 D D D D D D K K K K K K		8	$	$ & #9  ) + !  N  %h/, ' - / "L  %@ !*1 '*1 '+    "# !"  !5  -  @S  S SX    * 1
0
0
B
B
B
.
.
.
4
4!!"b"--   
 e%%b"b1122>>>888  "3 8C=    .
c 
 
 
 
* * * * * **S *S * * * *%(c]   &! ! ! 3 3    :333 c]3 3 3 3lU U U U U. U U U U Ur-   