
    Pi                        d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlmZ d dl	m
Z
mZmZmZmZmZ d dlZd dlmZ d dlmZ d dlmZmZmZmZmZ d dlmZ d dlmZ d d	l m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/ d d
l0m1Z1m2Z2m3Z3  e1d          Z4 G d de          Z5 G d de5          Z6 G d de5          Z7 G d de5          Z8 G d de5          Z9dS )    N)Future)Path)AnyDictListOptionalProtocolUnion)	save_file)
async_saveFileSystemReaderFileSystemWriterloadsave)training)convert_weights)ADAPTER_CONFIG_FNAMEADAPTER_MODEL_FNAMEcheck_outdir_not_in_ckptdir
copy_filesget_adapter_checkpoint_pathget_model_checkpoint_pathget_recipe_checkpoint_path	ModelTypeRECIPE_STATE_DIRNAMEREPO_ID_FNAMEsafe_torch_loadSAFETENSOR_INDEX_FNAMESHARD_FNAMESUFFIXES_TO_NOT_COPYTORCH_INDEX_FNAME)
get_loggerget_world_size_and_ranklog_rank_zeroDEBUGc                   N    e Zd ZdZdeeef         fdZdeeef         ddfdZdS )_CheckpointerInterfacea,  
    Interface implemented by Checkpointers in torchtune.

    torchtune checkpointers are designed to be composable components which can be plugged
    into any training recipe. Each checkpointer supports a specific set of models and training
    scenarios making these easy to understand, debug and extend. For example, the
    ``FullModelCheckpointer``s are used for loading and saving all of the model weights.
    This checkpointer can be used for Full-Finetuning scenarios or PEFT where the output is a
    merged checkpoint. In case the current suite of checkpointers are inadequate,
    users are encouraged to implement their own and contribute back to torchtune.

    torchtune is also designed to be "state-dict invariant". This means the checkpointer
    ensures that the output checkpoint has the same format as the original checkpoint i.e.
    the output checkpoint has the same keys split across the same number of files as the original
    checkpoint. Being "state-dict invariant" allows users to seamlessly use torchtune checkpoints
    with their favorite post-training tools from the open-source ecosystem without writing
    torchtune-specific convertors. To be "state-dict invariant", the ``load_checkpoint`` and
    ``save_checkpoint`` methods make use of the weight convertors available in
    ``torchtune/models/<model_folder>``.

    torchtune Checkpointers support two checkpointing scenarios:
        * End-of-training Checkpointing. The model weights at the end of a completed training
            run are written out to file. The checkpointer ensures that the output checkpoint
            files have the same keys as the input checkpoint file used to begin training. The
            checkpointer also ensures that the keys are partitioned across the same number of
            files as the original checkpoint. This ensures that the original metadata files can
            be used as is, and the output checkpoint can be used with any tool that understands
            the original checkpoint format. This includes popular inference engines such as
            ``llama.cpp`` and ``gpt-fast``. The output state dict has the following format:
            {
                "key_1": weight
                ...
            }


        Mid-training Chekpointing. In addition to the model checkpoint files, we output an
            additional "recipe_state.pt" file for intermediate checkpoints. These are currently
            output at the end of each epoch, and contain information such as optimizer state,
            number of epochs completed etc which is needed to correctly resume a previously
            interrupted training run. The recipe is responsible for constructing the state dict
            with the information it needs. The checkpointer extracts the model state dict
            (key = "model") and writes everything else out to "recipe_state.pt". To prevent us
            from flooding ``output_dir`` with checkpoint files, the recipe state is overwritten
            at the end of each epoch. The output state dicts have the following formats:

            Model:
                {
                    "key_1": weight
                    ...
                }

            Recipe State:
                {
                    "optimizer": ...,
                    "epoch": ...,
                    ...
                }

    returnc                     d S N )selfkwargss     /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torchtune/training/checkpointing/_checkpointer.pyload_checkpointz&_CheckpointerInterface.load_checkpointp           
state_dictNc                     d S r*   r+   )r,   r2   r-   s      r.   save_checkpointz&_CheckpointerInterface.save_checkpoints   r0   r1   )	__name__
__module____qualname____doc__r   strr   r/   r4   r+   r1   r.   r'   r'   3   sl        : :x4S>    $sCx. t      r1   r'   c                       e Zd ZdZ	 	 	 	 ddedee         dededee         d	ee         d
ededdfdZddede	ee
f         fdZ	 	 dde	ee
f         dedededdf
dZdS )FullModelTorchTuneCheckpointera  
    Checkpointer which reads and writes checkpoints in a format compatible with
    torchtune. No conversion of weights is required.

    Currently this supports reading a single checkpoint file only. This will likely change as
    we add support for larger models.

    Args:
        checkpoint_dir (str): Directory containing the checkpoint files
        checkpoint_files (List[str]): List of checkpoint files to load. Since the checkpointer takes care
            of sorting by file ID, the order in this list does not matter
        model_type (str): Model type of the model for which the checkpointer is being loaded, e.g. LLAMA3.
        output_dir (str): Directory to save the checkpoint files
        adapter_checkpoint (Optional[str]): Path to the adapter weights. If None,
            and `should_load_recipe_state=True`, then look for adapter_model.pt in output_dir/epoch_{largest_epoch}.
            Default is None.
        recipe_checkpoint (Optional[str]): Path to the recipe state checkpoint file. If None,
            and `should_load_recipe_state=True`, then look for recipe_state.pt in output_dir/RECIPE_STATE_DIRNAME.
            Default is None.
        resume_from_checkpoint (bool): If True, the checkpointer will load the additional checkpoint files corresponding to
            the recipe state from a previous run. Default is False. This flag is deprecated. Please use the
            should_load_recipe_state flag instead.
        should_load_recipe_state (bool): If True, the checkpointer will load the additional checkpoint files corresponding to
            the recipe state from a previous run. Default is False

    Raises:
        ValueError: If more than one checkpoint file is provided
    NFcheckpoint_dircheckpoint_files
model_type
output_diradapter_checkpointrecipe_checkpointresume_from_checkpointshould_load_recipe_stater(   c	                 4   t          |          dk    r t          dt          |           d          t          |          | _        || _        |r!|| _        t
                              d           t          |         | _        t          |          | _	        t          | j        | j	                   | j	                            dd           t          | j	        || j        d	          | _        t          | j	        || j        
          | _        t!          || j        | j	        | j        | j        d u          | _        | j        d         | _        | j        r>t
                              dd | j        D              d| j         d| j                    d S d S )N   ECurrently we only support reading from a single checkpoint file. Got  files instead.X*resume_from_checkpoint is deprecated. Please use the 'should_load_recipe_state' insteadckpt_dirout_dirTparentsexist_ok^epoch_(\d+)r?   r@   rC   patternr?   rA   rC   r=   r<   r?   rC   has_adapter_checkpointr   4Loading the recipe state using: 
	checkpoint_paths: c                 ,    g | ]}t          |          S r+   r9   .0paths     r.   
<listcomp>z;FullModelTorchTuneCheckpointer.__init__.<locals>.<listcomp>       )W)W)W#d)))W)W)Wr1   
	recipe_checkpoint: 
	adapter_checkpoint: len
ValueErrorr   _checkpoint_dir_should_load_recipe_stateloggerwarningr   _model_type_output_dirr   mkdirr   _adapter_checkpointr   _recipe_checkpointr   _checkpoint_paths_checkpoint_pathinfo	r,   r<   r=   r>   r?   r@   rA   rB   rC   s	            r.   __init__z'FullModelTorchTuneCheckpointer.__init__   s      A%%>+,,> > >  
  $N33)A&! 	-CD*NNj   %Z0
++#)43C	
 	
 	
 	
 	td;;; $?'1%)%C#	$
 $
 $
  #='/%)%C#
 #
 #
 ";-/'%)%C#'#;4#G"
 "
 "
 !% 6q 9) 	KKF)W)W@V)W)W)WF F*.*AF F ,0+CF F    	 	r1   Tweights_onlyc                    i }t          | j        |          |t          j        <   | j        r#t          | j                  }||t          j        <   | j        r+t          | j        d          }|                    |           |S )a  
        Load torchtune checkpoint from file. Currently only loading from a single file is supported.

        The output state_dict has the following format, with keys other than "model" only present if
        ``should_load_recipe_state`` is True:

        >>>     {
        >>>         "model": {
        >>>             "key_1": weight
        >>>             ...
        >>>         },
        >>>         "optimizer": {...},
        >>>         ...
        >>>     }

        Args:
            weights_only (bool): flag passed down to torch.load. We expose this, because quantized models
                cannot be loaded with weights_only=True

        Returns:
            Dict[str, Any]: state_dict from the input checkpoint
        )rp   Fmmap)	r   rl   r   	MODEL_KEYri   ADAPTER_KEYrc   rj   update)r,   rp   r2   adapter_state_dictrecipe_states        r.   r/   z.FullModelTorchTuneCheckpointer.load_checkpoint   s    . %'
)8!*
 *
 *

8%& # 	B!01I!J!J/AJx+,) 	,*4+BOOOLl+++r1   r2   epochintermediate_checkpointadapter_onlyc                    |st          j        d                    d          d                    d                    }t          j        | j        d| |                              d          }|j                            dd           t          j
        |t          j                 |           t                              dt          j                            |          d	z  d
d|            t          j        |v rt          j        | j        d| t&                                        d          }|j                            dd           t          j
        |t          j                 |           t                              dt          j                            |          d	z  d
d|            n|rt)          d          t+          | j        t          j        | j        d|           t.                     |r|                    t          j        d          }|                    t          j        d          }|                    t          j        d          }t          j        | j        t4          d          }|j                            dd           t          j
        ||           t                              dt          j                            |          d	z  d
d|            dS t                              d           |r*t                              d| j        j         d           dS t                              d           dS )a  
        Save torchtune checkpoint to file. If ``intermediate_checkpoint`` is True, an additional
        checkpoint file ``recipe_state.pt`` is created in ``_output_dir/RECIPE_STATE_DIRNAME``
        which contains the recipe state. The output state dicts have the following formats:

        >>> # Model
        >>> {
        >>>     "key_1": weight
        >>>     ...
        >>> }
        >>>
        >>> # Recipe state
        >>> {
        >>>     "optimizer": ...,
        >>>     "epoch": ...,
        >>>     ...
        >>> }

        Args:
            state_dict (Dict[str, Any]): State dict with model and (optionally) recipe state
            epoch (int): Current epoch number. This is added to the checkpoint file name to ensure
                we're not overwriting intermediate checkpoint files
            intermediate_checkpoint (bool): If True, save an additional checkpoint file with the
                recipe state
            adapter_only (bool): If True, only save the adapter weights. Default is False


        Raises:
            ValueError: if ``adapter_only`` is True and adapter checkpoint not found in state_dict.
        1   cpt_idx
num_shardsepoch_.binTrL   Model checkpoint of size    @.2f GiB saved to .ptAdapter checkpoint of size gAdapter checkpoint not found in state_dict. Please ensure that the state_dict contains adapter weights.ignore_suffixesNrecipe_state.ptRecipe checkpoint of size Saving final epoch checkpoint.Please note that you have set adapter_only=True, so only adapter weights will be saved.You need to merge the adapter weights into your base model for further use. See ".save_checkpoint for more details.The full model checkpoint, including all weights and configurations, has been saved successfully.You can now use this checkpoint for further training or inference.)r   formatzfillr   joinpathrg   with_suffixparentrh   torchr   r   rt   rd   rm   osrZ   getsizeru   r   ra   r   rb   r    popADAPTER_CONFIGr   	__class__r5   )r,   r2   ry   rz   r{   
shard_nameoutput_path_s           r.   r4   z.FullModelTorchTuneCheckpointer.save_checkpoint  s   N  	$+		!1  J - "25"2"2J k&!!  $$TD$AAAJz("45{CCCKK*7??;//'9D* *'* *   :--- "25"2"24G k%    $$TD$AAAJz("67EEEKK*7??;//'9D* *'* *     	y   	 M$*,<U,<,<==0	
 	
 	
 	
 # 	x1488Ax3T::Ax6==A- "68I K $$TD$AAAJz;///KK*7??;//'9D* *'* *     KK8999 
W>2W W W     Y    r1   NNFF)TFFr5   r6   r7   r8   r9   r   r   boolro   r   r   r/   intr4   r+   r1   r.   r;   r;   w   sN        F -1+/',).E EE s)E 	E
 E %SME $C=E !%E #'E 
E E E EN# #D #DcN # # # #R )."j jcNj j "&	j
 j 
j j j j j jr1   r;   c                       e Zd ZdZ	 	 	 	 	 ddedeee         eeef         f         deded	ee         d
ee         de	de	de	ddfdZ
deeef         fdZ	 	 ddeeef         dede	de	ddf
dZdS )FullModelHFCheckpointera	  
    Checkpointer which reads and writes checkpoints in HF's format. For LoRA models this includes
    saving checkpoints in a format that can be loaded into PEFT via e.g. ``from_pretrained``. Examples include
    the Llama-2-7b-hf model from the meta-llama repo (https://huggingface.co/meta-llama/Llama-2-7b-hf).

    Note:
        HF checkpoint names are usually ordered by ID (eg: 0001_of_0003, 0002_of_0003, etc.) To ensure         we read the files in the right order, we sort the checkpoint file names before reading.

    Note:
        Checkpoint conversion to and from HF's format requires access to model params which are         read directly from the ``config.json`` file. This helps ensure we either load the weights         correctly or error out in case of discrepancy between the HF checkpoint file and torchtune's         model implementations.

    Args:
        checkpoint_dir (str): Directory containing the checkpoint files
        checkpoint_files (Union[List[str], Dict[str, str]]): List of checkpoint files to load or a dictionary
            containing the keys keys ["filename_format", "max_filename"]. Since the checkpointer takes care
            of sorting by file ID, the order in this list does not matter.
        model_type (str): Model type of the model for which the checkpointer is being loaded, e.g. LLAMA3.
        output_dir (str): Directory to save the checkpoint files
        adapter_checkpoint (Optional[str]): Path to the adapter weights. If None,
            and `should_load_recipe_state=True`, then look for adapter_model.pt in output_dir/epoch_{largest_epoch}.
            Default is None.
        recipe_checkpoint (Optional[str]): Path to the recipe state checkpoint file. If None,
            and `should_load_recipe_state=True`, then look for recipe_state.pt in output_dir/RECIPE_STATE_DIRNAME.
            Default is None.
        resume_from_checkpoint (bool): If True, the checkpointer will load the additional checkpoint files corresponding to
            the receipe state from a previous run. Default is False. This flag is deprecated. Please use
            the should_load_recipe_state flag instead.
        safe_serialization (bool): If True, the checkpointer will save the checkpoint file using `safetensors`.
            Default is True.
        should_load_recipe_state (bool): If True, the checkpointer will load the additional checkpoint files corresponding to
            the receipe state from a previous run. Default is False
    NFTr<   r=   r>   r?   r@   rA   rB   safe_serializationrC   r(   c
                    |	| _         |r!|| _         t                              d           || _        t	          |          | _        t          |         | _        t	          |          | _        t          | j        | j                   | j        
                    dd           d | _        t          j        t	          j        | j        d                                                    | _        t	          j        | j        t"                                        d          }
d | _        |
                                rVt+          |
d          5 }t          j        |          }|                    d          | _        d d d            n# 1 swxY w Y   t1          | j        || j         d	
          | _        t5          | j        || j                   | _        t9          || j        | j        | j         | j        d u          | _        | j         r>t                              dd | j        D              d| j         d| j                    d S d S )NrH   rI   TrL   zconfig.json.jsonrrepo_idrO   rP   rR   rS   rU   c                 ,    g | ]}t          |          S r+   rW   rX   s     r.   r[   z4FullModelHFCheckpointer.__init__.<locals>.<listcomp>  r\   r1   r]   r^   )rc   rd   re   _safe_serializationr   rb   r   rf   rg   r   rh   _weight_mapjsonloadsr   	read_text_configr   r   r   existsopenr   getr   ri   r   rj   r   rk   rm   )r,   r<   r=   r>   r?   r@   rA   rB   r   rC   repo_id_path	json_filedatas                r.   ro   z FullModelHFCheckpointer.__init__  s    *B&! 	-CD*NNj   $6 #N33$Z0
++#)43C	
 	
 	
 	
 	td;;;
 ,0 zM$.>>HHJJ
 
 }T%9=IIUU
 
    	3lC(( 3Iy++#xx	223 3 3 3 3 3 3 3 3 3 3 3 3 3 3
 $?'1%)%C#	$
 $
 $
  #='/%)%C#
 #
 #
 ";-/'%)%C#'#;4#G"
 "
 "
 ) 	KKF)W)W@V)W)W)WF F*.*AF F ,0+CF F    	 	s   
/FF	F	c                 	   i | _         i }i }t          | j                  D ]\  }}t          |          }|                                D ]N\  }}t          |t          j                  s t          dt          |           d          |dz   d| j         |<   O|
                    |           ~t          j                     | j        t          j        t          j        fv rmt#          t$          d           ddlm} | j        d	         }	| j        d
         }
| j        d         }|	|
k    rd\  }	}
} |||	|
|          |t,          j        <   n| j        t          j        k    rEddlm}  ||| j        d	         | j        d
         | j        d                   |t,          j        <   n~| j        t          j        k    rQddlm}  ||| j        d	         | j        d
         | j        d         | j        d                   |t,          j        <   n| j        t          j        k    rddlm } | j        !                    di           }| j        !                    di           } |||d	         |d
         |d         |!                    dd          |d         |!                    dd          |d         |d         |d         |!                    dd                    |t,          j        <   n<| j        t          j"        k    r ddl#m$}  ||          |t,          j        <   n| j        t          j%        k    r^ddl&m'}  ||| j        d	         | j        d
         | j        d         | j        !                    dd                    |t,          j        <   n| j        t          j(        k    rdd l)m*}  ||          |t,          j        <   n`tW          j,        || j        d	         | j        d
         | j        d         | j        !                    dd                    |t,          j        <   | j-        r#t          | j-                  }||t,          j.        <   | j/        r+t          | j0        d!"          }|
                    |           |S )#a  
        Load HF checkpoint from file.

        The keys and weights from across all checkpoint files are merged into a single state_dict.
        We preserve the "state_dict key" <-> "checkpoint file" mapping in weight_map so we can
        write the state dict correctly in ``save_checkpoint``.

        Before returning, the model state dict is converted to a torchtune-compatible format using
        the appropriate convert_weights function (depending on ``self._model_type``).

        Returns:
            state_dict (Dict[str, Any]): torchtune checkpoint state dict

        Raises:
            ValueError: If the values in the input state_dict are not Tensors
        z@Expected all values in the state dict to be torch.Tensor. Found z	 instead.rE   04zpConverting Phi weights from HF format.Note that conversion of adapter weights into PEFT format is not supported.)rd   msgr   )phi3_hf_to_tunenum_attention_headsnum_key_value_headshidden_size)NNN	num_headsnum_kv_headsdim)reward_hf_to_tune)qwen2_hf_to_tunetie_word_embeddingsr   r   r   r   )llama3_vision_hf_to_tunetext_configvision_confighead_dimN
vocab_sizecross_attention_layers
image_sizemax_num_tilessupported_aspect_ratios
r   r   r   r   r   r   encoder_dim	tile_size	num_tilesr   )clip_text_hf_to_tune)gemma2_hf_to_tuner   r   r   r   )t5_encoder_hf_to_tuneFrr   )1r   	enumeraterk   r   items
isinstancer   Tensorra   typerv   gccollectrf   r   	PHI3_MINIPHI4r$   rd   &torchtune.models.phi3._convert_weightsr   r   r   rt   REWARDtorchtune.rlhf.utilsr   QWEN2'torchtune.models.qwen2._convert_weightsr   LLAMA3_VISION1torchtune.models.llama3_2_vision._convert_weightsr   r   	CLIP_TEXT&torchtune.models.clip._convert_weightsr   GEMMA2(torchtune.models.gemma2._convert_weightsr   
T5_ENCODER$torchtune.models.t5._convert_weightsr   r   
hf_to_tuneri   ru   rc   rj   )r,   merged_state_dictconverted_state_dictr   cpt_pathr2   keyvaluer   r   r   r   r   r   r   r   r   r   r   r   rw   rx   s                         r.   r/   z'FullModelHFCheckpointer.load_checkpoint  sW   $  68
 DF "+4+A!B!B 	 	GX(22J(..00 	< 	<
U "%66 $8!%e8 8 8  
 ,3Q;(;(; %%$$Z000 JLLLL	 3Y^DDD]   
 ONNNNN%:;I<(=>L,}-C L((/?,	<7F!#)	8 8 8 !344 !111>>>>>>7H7H!,'<=!\*?@L/	8 8 8 !344 00PPPPPP7G7G!,'<=!\*?@L/$(L1F$G8 8 8 !344 !888      ,**="==K L,,_bAAM7O7O!%&;<()>?.$T::&|4'27OQU'V'V)-8'5'8(5(9(9-t) )8 8 8 !344 !444SSSSSS7K7K!8 8 !344 !111RRRRRR7H7H!,'<=!\*?@L/))*d;;8 8 8 !344 !555RRRRRR7L7L!8 8 !344 8G7Q!,'<=!\*?@L/))*d;;8 8 8 !34 # 	L!01I!J!J9K !56) 	6*4+BOOOL ''555##r1   r2   ry   rz   r{   c                    |s| j         t          j        t          j        fv r0ddlm}  ||t          j                           |t          j        <   n| j         t          j        k    rUddl	m
}  ||t          j                 | j        d         | j        d         | j        d                   |t          j        <   nT| j         t          j        k    raddlm}  ||t          j                 | j        d         | j        d         | j        d         | j        d	         
          |t          j        <   n| j         t          j        k    rddlm} | j                            di           }	| j                            di           }
 ||t          j                 |	d         |	d         |	d         |	                    dd          |	d         |	                    dd          |
d         |
d         |
d         |
                    dd                    |t          j        <   n| j         t          j        k    rnddlm}  ||t          j                 | j        d         | j        d         | j        d         | j                            dd                    |t          j        <   npt-          j        |t          j                 | j        d         | j        d         | j        d         | j                            dd                    |t          j        <   i }d}|t          j                                                 D ]d\  }}| j        |         }||vri ||<   ||                             ||i           ||                                |                                z  z  }et;          |          }i |                                D ]\  }}t=          j        |                      d          |                      d                    }||<   tC          j"        | j#        d| |          }|j$        %                    dd           | j&        s+|'                    d          }tQ          j)        ||           n)|'                    d          }tU          ||dd i!           tV          ,                    d"tZ          j.        /                    |          d#z  d$d%|            !| j&        r-fd&| j                                        D             }t`          }n,fd'| j                                        D             }tb          }tC          j"        | j#        d| |          }d(|i|d)}te          |d*          5 }tg          j4        ||d+,           ddd           n# 1 swxY w Y   t          j5        |v rtC          j"        | j#        d| tl                    '                    d-          }|j$        %                    dd           tQ          j)        |t          j5                 |           tV          ,                    d.tZ          j.        /                    |          d#z  d$d%|            | j         t          j        t          j        fv rtV          7                    d/           n| j         t          j        k    rtV          7                    d0           n}t-          j8        |t          j5                 | j        d         | j        d         | j        d         | j                            dd                    |t          j5        <   tC          j"        | j#        d| tl                    }|j$        %                    dd           | j&        s;|'                    d          }tQ          j)        |t          j5                 |           n9|'                    d          }tU          |t          j5                 |dd i!           tV          ,                    d.tZ          j.        /                    |          d#z  d$d%|            n|rts          d1          t          j:        |v rg| j         t          j        t          j        fv rtV          7                    d2           n,| j         t          j        k    rtV          7                    d3           nt-          j;        |t          j:                 | j<        4          |t          j:        <   tC          j"        | j#        d| tz                    '                    d5          }te          |d*          5 }tg          j4        |t          j:                 |           ddd           n# 1 swxY w Y   tV          ,                    d.tZ          j.        /                    |          d#z  d$d%|            t}          | j?        tC          j"        | j#        d|           t          6           |r|A                    t          j        d          }|A                    t          j5        d          }|A                    t          j:        d          }tC          j"        | j#        t          d7          }|j$        %                    dd           tQ          j)        ||           tV          ,                    d8tZ          j.        /                    |          d#z  d$d%|            dS tV          ,                    d9           |r*tV          ,                    d:| jC        jD         d;           dS tV          ,                    d<           dS )=a  
        Save HF checkpoint to file. If ``intermediate_checkpoint`` is True, an additional
        checkpoint file ``recipe_state.pt`` is created in ``_output_dir/RECIPE_STATE_DIRNAME``
        which contains the recipe state.

        The state_dict is first converted back to the HF format and then partitioned based on the
        ``_weight_map`` into separate checkpoint files.

        Args:
            state_dict (Dict[str, Any]): Checkpoint state dict to be written out to file
            epoch (int): Epoch number. Used to create the checkpoint file name
            intermediate_checkpoint (bool): If True, an additional checkpoint files for recipe state
                and (if applicable) adapter weights are created. Default is False
            adapter_only (bool): If True, only save the adapter weights. Default is False

        Raises:
            ValueError: if ``adapter_only`` is True and adapter checkpoint not found in state_dict.
        r   )phi3_tune_to_hf)reward_tune_to_hfr   r   r   r   )qwen2_tune_to_hfr   r   )llama3_vision_tune_to_hfr   r   r   Nr   r   r   r   r   r   )gemma2_tune_to_hfr   r~   r   r   TrL   r   .safetensorsr   pt)metadatar   r   r   r   c                 .    i | ]\  }}||         d z   S )r   r+   rY   kr   map_original_name_to_new_names      r.   
<dictcomp>z;FullModelHFCheckpointer.save_checkpoint.<locals>.<dictcomp>  s:       "7 4W=N  r1   c                 .    i | ]\  }}||         d z   S )r   r+   r   s      r.   r  z;FullModelHFCheckpointer.save_checkpoint.<locals>.<dictcomp>  s:       "7 4W=F  r1   
total_size)r   
weight_mapw   )indentr   r   z^Saving Phi adapter weights to PEFT format is not supported, saving to torchtune format insteadzjSaving Llama3.2 Vision adapter weights to PEFT format is not supported, saving to torchtune format insteadr   zGPEFT integration for Phi is not supported, skipping adapter config savezSPEFT integration for Llama3.2 Vision is not supported, skipping adapter config save)adapter_configbase_model_name_or_pathr   r   r   r   r   r   r   r   )Erf   r   r   r   r   r   r   rt   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   
tune_to_hfr   r   rv   numelelement_sizer`   r   r   r   r   r   rg   r   rh   r   r   r   r   r   rd   rm   r   rZ   r   r   r!   r   r   dumpru   r   re   tune_to_peft_adapter_weightsra   r   tune_to_peft_adapter_configr   r   r   rb   r    r   r   r   r5   )r,   r2   ry   rz   r{   r   r   r   r   r   r   r   split_state_dictsr  r   weightr   r   model_state_dictr   r   r  index_file_name
index_path
index_datafr   r  s                              @r.   r4   z'FullModelHFCheckpointer.save_checkpoint}  s   4  J	3I$7#HHHRRRRRR1@x122 2
8-.. !Y%555BBBBBB1B1Bx12"l+@A!%.C!D]3	2 2 2
8-.. !Y_44TTTTTT1A1Ax12"l+@A!%.C!D]3(,5J(K2 2 2
8-.. !Y%<<<      #l..}bAA $ 0 0" E E1I1Ix12)*?@!,-B!C#M2(__Z>>*<8+6??0$, , !.m <+L9+O<,9,=,=14- -2 2 2
8-.." !Y%555VVVVVV1B1Bx12"l+@A!%.C!D]3!\--j$??2 2 2
8-.. 2A1Kx12"l+@A!%.C!D]3!\--j$??2 2 2
8-. EGJ)(*<=CCEE E EV*3/ "33313%g.!'*113-@@@fllnnv/B/B/D/DDD

 .//J,.)->-D-D-F-F  )) )/&L..q11o>S>STU>V>V  
 :D-g6"m$&6u&6&6
  "(((EEE/ X"-"9"9&"A"AKJ/===="-"9"9."I"IK.xQUFVWWWW.w{33g=H. . +. .    ' 4   &*&6&<&<&>&>  
 #9   &*&6&<&<&>&>  
 #4 "25"2"2O J
 *:6( J j#&& 3!	*a22223 3 3 3 3 3 3 3 3 3 3 3 3 3 3 :-- - "25"2"24G k%    $$TD$AAAJz("67EEEKK*7??;//'9D* *'* *   I$7#HHHt    !Y%<<< A    $@x34"l+@A!%.C!D]3!\--j$??   ( #m$&6u&6&68K  "(((EEE/ 	"-"9"9&"A"AKJz(*>?MMMM"-"9"9."I"IK"8#78#"*D!1   
 .w{33g=H. . +. .   
  	y   "j00I$7#HHH]    !Y%<<<i    $?#-h.E#F,0L   + #m$&6u&6&68L +g&&  +s++ FqIj)@A1EEEF F F F F F F F F F F F F F F.w{33g=H. . +. .   	 M$*,<U,<,<==0	
 	
 	
 	
 # 	x1488Ax3T::Ax6==A- "68I K $$TD$AAAJz;///KK*7??;//'9D* *'* *     KK8999 
W>2W W W     Y    s$   :VV"%V"&ff
f)NNFTFr   )r5   r6   r7   r8   r9   r
   r   r   r   r   ro   r   r/   r   r4   r+   r1   r.   r   r   n  sg       # #V -1+/',#').Q QQ  S	4S> 9:Q 	Q
 Q %SMQ $C=Q !%Q !Q #'Q 
Q Q Q QfT$c3h T$ T$ T$ T$t )."a acNa a "&	a
 a 
a a a a a ar1   r   c                       e Zd ZdZ	 	 	 	 ddedee         dededee         d	ee         d
ededdfdZde	ee
f         fdZ	 	 dde	ee
f         dedededdf
dZdS )FullModelMetaCheckpointera  
    Checkpointer which reads and writes checkpoints in Meta's format. Examples include
    the Llama-2-7b model from the meta-llama repo (https://huggingface.co/meta-llama/Llama-2-7b)

    Currently we support reading from a single checkpoint file only. Support for reading from
    sharded checkpoints is WIP.

    Args:
        checkpoint_dir (str): Directory containing the checkpoint files
        checkpoint_files (List[str]): List of checkpoint files to load. Currently this checkpointer only
            supports loading a single checkpoint file.
        model_type (str): Model type of the model for which the checkpointer is being loaded, e.g. LLAMA3.
        output_dir (str): Directory to save the checkpoint files
        adapter_checkpoint (Optional[str]): Path to the adapter weights. If None,
            and `should_load_recipe_state=True`, then look for adapter_model.pt in output_dir/epoch_{largest_epoch}.
            Default is None.
        recipe_checkpoint (Optional[str]): Path to the recipe state checkpoint file. If None,
            and `should_load_recipe_state=True`, then look for recipe_state.pt in output_dir/recipe_state.
            Default is None.
        resume_from_checkpoint (bool): If True, the checkpointer will load the additional checkpoint files corresponding to
                the recipe state from a previous run. Default is False. This flag is deprecated. Please use the
                should_load_recipe_state instead.
        should_load_recipe_state (bool): If True, the checkpointer will load the additional checkpoint files corresponding to
                the recipe state from a previous run. Default is False

    Raises:
        ValueError: If ``checkpoint_files`` is not a list of length 1
    NFr<   r=   r>   r?   r@   rA   rB   rC   r(   c	                 4   t          |          dk    r t          dt          |           d          t          |          | _        || _        |r!|| _        t
                              d           t          |         | _        t          |          | _	        t          | j        | j	                   | j	                            dd           t          | j	        || j        d	          | _        t          | j	        || j        
          | _        t!          || j        | j	        | j        | j        d u          | _        | j        d         | _        | j        r>t
                              dd | j        D              d| j         d| j                    d S d S )NrE   rF   rG   rH   rI   TrL   rO   rP   rR   rS   r   rU   c                 ,    g | ]}t          |          S r+   rW   rX   s     r.   r[   z6FullModelMetaCheckpointer.__init__.<locals>.<listcomp>  r\   r1   r]   r^   r_   rn   s	            r.   ro   z"FullModelMetaCheckpointer.__init__  s      A%%>+,,> > >  
  $N33)A&! 	-CD*NNj   %Z0
++#)43C	
 	
 	
 	
 	td;;; $?'1%)%C#	$
 $
 $
  #='/%)%C#
 #
 #
 ";-/'%)%C#'#;4#G"
 "
 "
 !% 6q 9) 	KKF)W)W@V)W)W)WF F*.*AF F ,0+CF F    	 	r1   c                 <   i }t          | j                  }| j        t          j        k    rddlm}  ||          |t          j        <   n!t          j
        |          |t          j        <   | j        t          j        k    r?t                              d           |t          j                                     d           | j        r#t          | j                  }||t          j        <   | j        r+t          | j        d          }|                    |           |S )zi
        Load Meta checkpoint from file. Currently only loading from a single file is supported.
        r   )llama3_vision_meta_to_tunezyIdentified model_type = Llama3_2. Ignoring output.weight in checkpoint in favor of the tok_embedding.weight tied weights.output.weightFrr   )r   rl   rf   r   r   r   r  r   rt   r   meta_to_tuneLLAMA3_2rd   rm   r   ri   ru   rc   rj   rv   )r,   r2   r  r  rw   rx   s         r.   r/   z)FullModelMetaCheckpointer.load_checkpoint  s;    %'
*4+@AAy666      .H-G . .Jx)** .=-I . .Jx)*
 y111KK!  
 x)*..???# 	B!01I!J!J/AJx+,) 	,*4+BOOOLl+++r1   r2   ry   rz   r{   c                    |sz|t           j                 }| j        t          j        k    rddlm}  ||          |t           j        <   nE| j        t          j        k    rd|vr|d         |d<   t          j	        |          |t           j        <   t          j        d                    d          d                    d                    }t          j        | j        d| |                              d	          }|j                            d
d
           t'          j        |t           j                 |           t*                              dt.          j                            |          dz  dd|            t           j        |v rt          j        | j        d| t6                                        d          }	|	j                            d
d
           t'          j        |t           j                 |	           t*                              dt.          j                            |	          dz  dd|	            n|rt9          d          t;          | j        t          j        | j        d|           t>                     |r|                     t           j        d          }
|                     t           j        d          }
|                     t           j!        d          }
t          j        | j        tD          d          }	|	j                            d
d
           t'          j        ||	           t*                              dt.          j                            |	          dz  dd|	            dS t*                              d           |r*t*                              d| j#        j$         d           dS t*                              d           dS )a*  
        Save Meta checkpoint to file. If ``intermediate_checkpoint`` is True, an additional
        checkpoint file ``recipe_state.pt`` is created in ``_output_dir/RECIPE_STATE_DIRNAME``
        which contains the recipe state.

        Args:
            state_dict (Dict[str, Any]): Checkpoint state dict to be written out to file
            epoch (int): Epoch number. Used to create the checkpoint file name
            intermediate_checkpoint (bool): If True, an additional checkpoint files for recipe state
                and (if applicable) adapter weights are created. Default is False
            adapter_only (bool): If True, only save the adapter weights. Default is False

        Raises:
            ValueError: if ``adapter_only`` is True and adapter checkpoint not found in state_dict.
        r   )llama3_vision_tune_to_metar  ztok_embeddings.weightr}   r~   r   r   r   TrL   r   r   r   r   r   r   r   r   Nr   r   r   r   r   r   )%r   rt   rf   r   r   r   r#  r!  r   tune_to_metar   r   r   r   r   rg   r   r   rh   r   r   rd   rm   r   rZ   r   ru   r   ra   r   rb   r    r   r   r   r   r5   )r,   r2   ry   rz   r{   r  r#  model_filenamecheckpoint_filer   r   s              r.   r4   z)FullModelMetaCheckpointer.save_checkpoint)  sE   .  &	)(*<=9#:::      2L1K$2 2
8-.. $	(:::'/???8H/9$_5 2A1M$2 2
8-.
 )/		!1  N #m "25"2"2N k&!!  "(((EEEJz("45GGGKK.7???33g=H. .+. .   :--- "25"2"24G k%    $$TD$AAAJz("67EEEKK*7??;//'9D* *'* *     	y   	 M$*,<U,<,<==0	
 	
 	
 	
 # 	x1488Ax3T::Ax6==A- "68I K $$TD$AAAJz;///KK*7??;//'9D* *'* *     KK8999 
W>2W W W     Y    r1   r   r   r   r+   r1   r.   r  r    sB        F -1+/',).C CC s)C 	C
 C %SMC $C=C !%C #'C 
C C C CJ#c3h # # # #R )."t tcNt t "&	t
 t 
t t t t t tr1   r  c            	           e Zd ZdZ	 ddededeej                 ddfdZdee         fdZ		 dd	e
eef         d
ee         de
eef         fdZ	 dd	e
eef         dededdfdZdS )DistributedCheckpointera  
    Checkpointer which reads and writes checkpoints in the DistributedCheckpointing format.

    Args:
        checkpoint_dir (str): Directory containing the checkpoint files
        output_dir (str): Directory to save the checkpoint files
        process_group (Optional[dist.ProcessGroup]): Optional process group to use
            for distributed saving/loading. If None, the default process group will be used.
            For checkpointing, gloo CPU-based backend is needed.
    Nr<   r?   process_groupr(   c                     t          |          | _        t          |          | _        d | _        d| _        d| _        t                      \  }| _        || _        d S )N
dist_epochz	.metadata)	r   rb   rg   _checkpoint_future_checkpoint_dir_prefix_metadata_filer#   _rank_process_group)r,   r<   r?   r)  r   s        r.   ro   z DistributedCheckpointer.__init__  s[      $N33
++"&&2#)/114:;Hr1   c                     t          j         j         d           fdt          j         j                  D             }|r=t          |d           d         }t          j                             j        |          S dS )a`  
        This method iterates over the available intermediate distributed checkpoints and
        finds the latest checkpoint to load.

        Returns:
            str: The fully qualified path of the checkpoint directory containing the latest and valid
            intermediate checkpoint. A valid checkpoint needs to have the metadata file.
        z_(\d+)c           	          g | ]a}t          j        |          t          j                            t          j                            j        |j                            _|bS r+   )rematchr   rZ   isfilejoinrg   r.  )rY   namecheckpoint_dir_patternr,   s     r.   r[   zODistributedCheckpointer._get_latest_intermediate_checkpoint.<locals>.<listcomp>  sq     
 
 
x.55
 T-tT5HII 	

 
 
r1   c                 R    t          |                     d          d                   S )Nr   )r   split)xs    r.   <lambda>zMDistributedCheckpointer._get_latest_intermediate_checkpoint.<locals>.<lambda>  s    AGGCLL4D0E0E r1   )r   r:  N)	r3  compiler-  r   listdirrg   sortedrZ   r6  )r,   checkpoint_pathslatest_checkpoint_dirr8  s   `  @r.   #_get_latest_intermediate_checkpointz;DistributedCheckpointer._get_latest_intermediate_checkpoint  s     "$t/J,S,S,S!T!T
 
 
 
 

4#344
 
 
  	I$* &E&E% % %%! 7<< 02GHHHtr1   r2   checkpoint_pathc                     |t          d          |%|                                 }|t          d          t          t          d|            t	          |t          |          | j                   |S )z
        Load a Distributed checkpoint saved at the <checkpoint_path>
        If no path is provided, latest intermediate checkpoint is loaded.
        Nz=State dict must be provided to load a distributed checkpoint.zNo checkpoint path was provided.Also, No intermediate checkpoint was found in the output directory.Please ensure that a checkpoint exists to load.zLoading checkpoint from r   )r2   storage_readerr)  )ra   rC  r$   rd   r   r   r0  )r,   r2   rD  s      r.   r/   z'DistributedCheckpointer.load_checkpoint  s     O  
 ""FFHHO& F   	f"N_"N"NOOOO!+O<<-	
 	
 	
 	
 r1   Fry   
save_asyncc           	          t          t          d|            t          j         j         j         d|            j        r j                                        st          j	                    }t          
                    d j         d            j                                         t          
                    d j         dt          j	                    |z
  dd           d	 _        t          j	                    }|rd
t          dd	f fd}t          |t          ddd           j                   _        t          
                    d j         dt          j	                    |z
  dd            j                            |           nBt          t          d d           t%          |t          ddd           j                   t          t          d           d	S )aa  
        Save a distributed checkpoint to storage.
        If ``save_async`` is True, the save happens asynchronously unblocking the GPUs sooner. This
        should only be used for the intermediate checkpoints. Final checkpoint has to be a synchronous
        one as the finetuning job can not terminate until the checkpoint gets persisted.

        Args:
            state_dict (Dict[str, Any]): Checkpoint state dict to be written out to file
            epoch (int): Epoch number. Used to create the checkpoint file name
            save_async (bool): If True, save the checkpoint asynchronously
        z=DistributedCheckpointer is saving a checkpoint for the epoch rF  r   Rank zW: previous checkpoint has not finished. Checkpointing frequency is too high. Waiting...z	: waited r   z* seconds for previous checkpoint to finishNr  r(   c           	          |                                  (t                              dj         d d           d S t                              dj         d d|                                              d S )NrJ  z(: Checkpoint is saved asynchronously to z successfully.z.: Checkpoint failed to save asynchronously to z with the exception )	exceptionrd   rm   r/  error)r  rD  r,   s    r.   callbackz9DistributedCheckpointer.save_checkpoint.<locals>.callback%  s     ;;==(KKs
ssTcsss     LL>
 > >Zi > >./kkmm> >    r1      F)thread_countsingle_file_per_rank
sync_files)r2   storage_writerr)  z: Trainer was blocked for z' seconds for checkpointing to finish...z)Saving model checkpoint synchronously to .zThe full model checkpoint, including all the weights and configurations, has been saved successfully by the DistributedCheckpointer. You can now use this checkpoint for further training.)r$   rd   r   r   rg   r-  r,  donetimeperf_counterrm   r/  resultr   r   r   r0  add_done_callbackr   )r,   r2   ry   rH  
wait_startcp_startrN  rD  s   `      @r.   r4   z'DistributedCheckpointer.save_checkpoint  s   $ 	WPUWW	
 	
 	
 	

 -!<FFuFF
 
 " 	+4+B+G+G+I+I 	+*,,JKK{
{{{   #**,,,KK}
}}T->-@-@:-M}}}}   '+D#$&& /	       '1%/#!#).$	      #1	' 	' 	'D# KK1
 1 1d>O>Q>QT\>\k 1 1 1  
 #55h????RRRR   
 %/#!#).$	      #1	 	 	 	 	d	
 	
 	
 	
 	
 	
r1   r*   )NN)F)r5   r6   r7   r8   r9   r   distProcessGroupro   rC  r   r   r/   r   r   r4   r+   r1   r.   r(  r(    s6       	 	 6:	I II I   12	I
 
I I I IXc]    8 SW   sCx. BJ3- 	c3h       L !	a
 a
cNa
 a
 	a

 
a
 a
 a
 a
 a
 a
r1   r(  ):r   r   r   r3  rV  concurrent.futuresr   pathlibr   typingr   r   r   r   r	   r
   r   torch.distributeddistributedr\  safetensors.torchr   torch.distributed.checkpointr   r   r   r   r   	torchtuner   torchtune.modelsr   'torchtune.training.checkpointing._utilsr   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   torchtune.utilsr"   r#   r$   rd   r'   r;   r   r  r(  r+   r1   r.   <module>ri     s	   
			  				 				  % % % % % %       = = = = = = = = = = = = = = = =              ' ' ' ' ' '                    , , , , , ,                                 " O N N N N N N N N N	G		A A A A AX A A AHt t t t t%; t t tnp p p p p4 p p pf| | | | | 6 | | |~x
 x
 x
 x
 x
4 x
 x
 x
 x
 x
r1   