
    &`i4              
       &   d dl Z d dlmZ d dlmZmZmZmZmZ d dl	Z	d dl
Z
d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ  e j        e          ZdZde de!de"de#de f
dZ$ ed           G d de                      ZdS )    N)partial)AnyCallableDictOptionalUnion)
Checkpoint)TRAIN_DATASET_KEY)RayTrainReportCallback)LightGBMConfig)LightGBMTrainer)
GenDataset)_log_deprecation_warning)	PublicAPIa\  Passing in `lightgbm.train` kwargs such as `params`, `num_boost_round`, `label_column`, etc. to `LightGBMTrainer` is deprecated in favor of the new API which accepts a `train_loop_per_worker` argument, similar to the other DataParallelTrainer APIs (ex: TorchTrainer). See this issue for more context: https://github.com/ray-project/ray/issues/50042configlabel_columnnum_boost_rounddataset_keyslightgbm_train_kwargsc           
      4   t           j                                        }d }|}|rNt          j        |          }|                                }||z
  }t                              d| d|d           t           j                            t                    }	|	
                                                                }
d |D             }d |                                D             }|
                    |d          |
|         }}t          j        ||          }|g}t          g}|                                D ]b\  }}|                    |d          ||         }}|                    t          j        ||                     |                    |           c|                     t           j        j                                                   |                     d	d
           |                     dd           t          j        d| |||||d| d S )Nz7Model loaded from checkpoint will train for additional zY iterations (trees) in order to achieve the target number of iterations (num_boost_round=z).c                 d    i | ]-}|t           k    |t          j                            |          .S  )r
   raytrainget_dataset_shard).0ks     w/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/ray/train/lightgbm/lightgbm_trainer.py
<dictcomp>z1_lightgbm_train_fn_per_worker.<locals>.<dictcomp>6   s@       !!! 	
39&&q))!!!    c                 b    i | ],\  }}||                                                                 -S r   )materialize	to_pandas)r   r   ds      r   r   z1_lightgbm_train_fn_per_worker.<locals>.<dictcomp>;   s2    QQQ41a1==??,,..QQQr       )axis)labeltree_learnerdata_parallelpre_partitionT)params	train_setr   
valid_setsvalid_names
init_modelr   )r   r   get_checkpointr   	get_modelcurrent_iterationloggerinfor   r
   r"   r#   itemsdroplightgbmDatasetappendupdateget_network_params
setdefault)r   r   r   r   r   
checkpointstarting_modelremaining_itersstarting_itertrain_ds_itertrain_dfeval_ds_iterseval_dfstrain_Xtrain_yr,   r-   r.   	eval_nameeval_dfeval_Xeval_ys                         r   _lightgbm_train_fn_per_workerrK      sU    ))++JN%O 	
/9*EE&88::)M9%)% %  % % %	
 	
 	
 I//0ABBM((**4466H   M
 RQ=;N;N;P;PQQQH}}\}::H\<RWG 888I J$%K&nn.. & &	7 l;;W\=R(*6@@@AAA9%%%% MM#)$7799:::
no666
ot,,,N '!       r    beta)	stabilityc                   X    e Zd ZdZdZdZ	 ddddddddddddddeeeg df         ee	gdf         f                  dee	         dee
         deej        j                 d	eej        j                 d
ee	eef                  deej        j                 dee         dee	eef                  dee         dee	eef                  dee         f fdZde	d	eej        j                 d
ee	eef                  dee         dee         dee	gdf         fdZededej        fd            Z xZS )r   a  A Trainer for distributed data-parallel LightGBM training.

    Example
    -------

    .. testcode::
        :skipif: True

        import lightgbm

        import ray.data
        import ray.train
        from ray.train.lightgbm import RayTrainReportCallback, LightGBMTrainer

        def train_fn_per_worker(config: dict):
            # (Optional) Add logic to resume training state from a checkpoint.
            # ray.train.get_checkpoint()

            # 1. Get the dataset shard for the worker and convert to a `lightgbm.Dataset`
            train_ds_iter, eval_ds_iter = (
                ray.train.get_dataset_shard("train"),
                ray.train.get_dataset_shard("validation"),
            )
            train_ds, eval_ds = train_ds_iter.materialize(), eval_ds_iter.materialize()
            train_df, eval_df = train_ds.to_pandas(), eval_ds.to_pandas()
            train_X, train_y = train_df.drop("y", axis=1), train_df["y"]
            eval_X, eval_y = eval_df.drop("y", axis=1), eval_df["y"]
            dtrain = lightgbm.Dataset(train_X, label=train_y)
            deval = lightgbm.Dataset(eval_X, label=eval_y)

            params = {
                "objective": "regression",
                "metric": "l2",
                "learning_rate": 1e-4,
                "subsample": 0.5,
                "max_depth": 2,
                # Adding the line below is the only change needed
                # for your `lgb.train` call!
                **ray.train.lightgbm.get_network_params(),
            }

            # 2. Do distributed data-parallel training.
            # Ray Train sets up the necessary coordinator processes and
            # environment variables for your workers to communicate with each other.
            bst = lightgbm.train(
                params,
                train_set=dtrain,
                valid_sets=[deval],
                valid_names=["validation"],
                num_boost_round=10,
                callbacks=[RayTrainReportCallback()],
            )

        train_ds = ray.data.from_items([{"x": x, "y": x + 1} for x in range(32)])
        eval_ds = ray.data.from_items([{"x": x, "y": x + 1} for x in range(16)])
        trainer = LightGBMTrainer(
            train_fn_per_worker,
            datasets={"train": train_ds, "validation": eval_ds},
            scaling_config=ray.train.ScalingConfig(num_workers=4),
        )
        result = trainer.fit()
        booster = RayTrainReportCallback.get_model(result.checkpoint)

    Args:
        train_loop_per_worker: The training function to execute on each worker.
            This function can either take in zero arguments or a single ``Dict``
            argument which is set by defining ``train_loop_config``.
            Within this function you can use any of the
            :ref:`Ray Train Loop utilities <train-loop-api>`.
        train_loop_config: A configuration ``Dict`` to pass in as an argument to
            ``train_loop_per_worker``.
            This is typically used for specifying hyperparameters.
        lightgbm_config: The configuration for setting up the distributed lightgbm
            backend. Defaults to using the "rabit" backend.
            See :class:`~ray.train.lightgbm.LightGBMConfig` for more info.
        datasets: The Ray Datasets to use for training and validation.
        dataset_config: The configuration for ingesting the input ``datasets``.
            By default, all the Ray Datasets are split equally across workers.
            See :class:`~ray.train.DataConfig` for more details.
        scaling_config: The configuration for how to scale data parallel training.
            ``num_workers`` determines how many Python processes are used for training,
            and ``use_gpu`` determines whether or not each process should use GPUs.
            See :class:`~ray.train.ScalingConfig` for more info.
        run_config: The configuration for the execution of the training run.
            See :class:`~ray.train.RunConfig` for more info.
        resume_from_checkpoint: A checkpoint to resume training from.
            This checkpoint can be accessed from within ``train_loop_per_worker``
            by calling ``ray.train.get_checkpoint()``.
        metadata: Dict that should be made available via
            `ray.train.get_context().get_metadata()` and in `checkpoint.get_metadata()`
            for checkpoints saved from this Trainer. Must be JSON-serializable.
        label_column: [Deprecated] Name of the label column. A column with this name
            must be present in the training dataset.
        params: [Deprecated] LightGBM training parameters.
            Refer to `LightGBM documentation <https://lightgbm.readthedocs.io/>`_
            for a list of possible parameters.
        num_boost_round: [Deprecated] Target number of boosting iterations (trees in the model).
            Note that unlike in ``lightgbm.train``, this is the target number
            of trees, meaning that if you set ``num_boost_round=10`` and pass a model
            that has already been trained for 5 iterations, it will be trained for 5
            iterations more, instead of 10 more.
        **train_kwargs: [Deprecated] Additional kwargs passed to ``lightgbm.train()`` function.
    TN)train_loop_configlightgbm_configscaling_config
run_configdatasetsdataset_configresume_from_checkpointmetadatar   r+   r   train_loop_per_workerrO   rP   rQ   rR   rS   rT   rU   rV   r   r+   r   c                   |d u }|r|                      |||
||          }|pi }n.|r,t          d|                                 dt                      t	          t
          |                               |||||||||		  	         d S )N)r   rR   r   r   rS   zPPassing `lightgbm.train` kwargs to `LightGBMTrainer` is deprecated. Got kwargs: z^
In your training function, you can call `lightgbm.train(**kwargs)` with arbitrary arguments. )	rW   rO   rP   rQ   rR   rS   rT   rU   rV   )_get_legacy_train_fn_per_workerr   keys+LEGACY_LIGHTGBM_TRAINER_DEPRECATION_MESSAGEsuperr   __init__)selfrW   rO   rP   rQ   rR   rS   rT   rU   rV   r   r+   r   train_kwargs
legacy_api	__class__s                  r   r]   zLightGBMTrainer.__init__   s    * +d2
 	$($H$H&2%) /! %I % %! !'" 	$A+0022A A ?	A A   	ot$$--"7/+)!)#9 	. 
	
 
	
 
	
 
	
 
	
r    r   returnc           	      P   |pi }|                     t                    s9t          dt           dt          |                                                     |st          d          |pd}t          t                     |                     dg           }t          d |D                       }i }|r&|j        j	        }	|j        j
        }
|	|d<   |
|
nd	|d
<   |s|                    t          di |           ||d<   t          t          ||t          |          |          }|S )z=Get the training function for the legacy LightGBMTrainer API.z`datasets` must be provided for the LightGBMTrainer API if `train_loop_per_worker` is not provided. This dict must contain the training dataset under the key: 'z'. Got keys: z`label_column` must be provided for the LightGBMTrainer API if `train_loop_per_worker` is not provided. This is the column name of the label in the dataset.
   	callbacksc              3   @   K   | ]}t          |t                    V  d S N)
isinstancer   )r   callbacks     r   	<genexpr>zBLightGBMTrainer._get_legacy_train_fn_per_worker.<locals>.<genexpr>  s>       %
 %
=EJx!788%
 %
 %
 %
 %
 %
r    	frequencyNTcheckpoint_at_end)r   r   r   r   r   )getr
   
ValueErrorlistrZ   r   r[   anycheckpoint_configcheckpoint_frequencyrl   r9   r   r   rK   set)r^   r   rR   rS   r   r   re   user_supplied_callbackcallback_kwargsrr   rl   train_fn_per_workers               r   rY   z/LightGBMTrainer._get_legacy_train_fn_per_worker   s    >r||-.. 	5 +5 5 "(--//22	5 5    	G   */R !LMMM *--k2>>	!$ %
 %
IR%
 %
 %
 "
 "
  	#-#?#T  * < N+?OK( &7%B!! /0 & 	H3FFoFFGGG-6k*%)%+X"7
 
 
 #"r    r=   c                 *    t          j        |          S )z6Retrieve the LightGBM model stored in this checkpoint.)r   r1   )clsr=   s     r   r1   zLightGBMTrainer.get_model6  s     &/
;;;r    rg   )__name__
__module____qualname____doc___handles_checkpoint_freq_handles_checkpoint_at_endr   r   r   r   r   r   r   ScalingConfig	RunConfigstrr   
DataConfigr	   r   intr]   rY   classmethodr7   Boosterr1   __classcell__)ra   s   @r   r   r   Z   s7       f fP  $!% 	2
 -148<@48489=7;-1&*+/)-#2
 2
 2
'(2t8$hvt|&<<= 
2
 $D>2
 ".12
 !!892
 SY012
 4Z012
 !!562
 !) 42
 4S>*2
 sm2
  c3h(!2
" "##2
 2
 2
 2
 2
 2
h9##9# SY019# 4Z01	9#
 sm9# "#9# 
4&$,	9# 9# 9# 9#v << 
	< < < [< < < < <r    r   )%logging	functoolsr   typingr   r   r   r   r   r7   r   	ray.trainr	   ray.train.constantsr
   "ray.train.lightgbm._lightgbm_utilsr   ray.train.lightgbm.configr   ray.train.lightgbm.v2r   SimpleLightGBMTrainerray.train.trainerr   ray.train.utilsr   ray.util.annotationsr   	getLoggerry   r3   r[   dictr   r   rs   rK   r   r    r   <module>r      s          7 7 7 7 7 7 7 7 7 7 7 7 7 7  



             1 1 1 1 1 1 E E E E E E 4 4 4 4 4 4 J J J J J J ( ( ( ( ( ( 4 4 4 4 4 4 * * * * * *		8	$	$6 ,999 9 	9
  9 9 9 9x Va< a< a< a< a<+ a< a< a< a< a<r    