
     `iNA                        d Z ddlmZmZmZ ddlZ	 ddlmZ n# e	e
f$ r	 ddlmZ Y nw xY wddlmZ  eej        j        d          rej        j        j        Znej        j        Z G d d	ej                  Z	 	 	 	 	 	 	 	 	 d dedededededededee         dee         dededeee                  fdZ G d de          Z G d d          ZdS )!z?Functions and classes related to optimization (weight updates).    )CallableOptionalUnionN)Adam   )keraslearning_rate_schedulec                   V     e Zd ZdZ	 	 ddededededee         f
 fd	Z	d
 Z
d Z xZS )WarmUpa  
    Applies a warmup schedule on a given learning rate decay schedule.

    Args:
        initial_learning_rate (`float`):
            The initial learning rate for the schedule after the warmup (so this will be the learning rate at the end
            of the warmup).
        decay_schedule_fn (`Callable`):
            The schedule function to apply after the warmup for the rest of training.
        warmup_steps (`int`):
            The number of steps for the warmup part of training.
        power (`float`, *optional*, defaults to 1.0):
            The power to use for the polynomial warmup (defaults is a linear warmup).
        name (`str`, *optional*):
            Optional name prefix for the returned tensors during the schedule.
          ?Ninitial_learning_ratedecay_schedule_fnwarmup_stepspowernamec                     t                                                       || _        || _        || _        || _        || _        d S N)super__init__r   r   r   r   r   )selfr   r   r   r   r   	__class__s         p/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/optimization_tf.pyr   zWarmUp.__init__7   sE     	%:"(
!2			    c                     t          j         j        pd          5 }t          j        t           j                  }t          j         j        t           j                  }||z  } j        t           j                            | j	                  z  t          j
        ||k     fd fd|          cd d d            S # 1 swxY w Y   d S )Nr   c                       S r    )warmup_learning_rates   r   <lambda>z!WarmUp.__call__.<locals>.<lambda>P   s    , r   c                  >                           j        z
            S r   )r   r   )r   steps   r   r   z!WarmUp.__call__.<locals>.<lambda>Q   s    ..td6G/GHH r   r   )tf
name_scoper   castfloat32r   r   mathpowr   cond)r   r    r   global_step_floatwarmup_steps_floatwarmup_percent_doner   s   ``    @r   __call__zWarmUp.__call__F   s   ]49011 	T !#bj 9 9!#):BJ!G!G"36H"H#'#=L_aeak@l@l#l 7!$66,,,,HHHHH	  	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   BCCCc                 D    | j         | j        | j        | j        | j        dS )Nr   r   r   r   r   r.   r   s    r   
get_configzWarmUp.get_configU   s-    %)%?!%!7 -ZI
 
 	
r   )r   N)__name__
__module____qualname____doc__floatr   intr   strr   r,   r0   __classcell__r   s   @r   r   r   %   s         , " $ $ 	
  sm       
 
 
 
 
 
 
r   r           ?+?:0yE>r   init_lrnum_train_stepsnum_warmup_stepsmin_lr_ratio
adam_beta1
adam_beta2adam_epsilonadam_clipnormadam_global_clipnormweight_decay_rater   include_in_weight_decayc                    t                               | ||z
  | |z  |
          }|rt          | ||          }|	dk    rt          ||	|||||g d|	  	        }n%t          j                            ||||||          }||fS )a  
    Creates an optimizer with a learning rate schedule using a warmup phase followed by a linear decay.

    Args:
        init_lr (`float`):
            The desired learning rate at the end of the warmup phase.
        num_train_steps (`int`):
            The total number of training steps.
        num_warmup_steps (`int`):
            The number of warmup steps.
        min_lr_ratio (`float`, *optional*, defaults to 0):
            The final learning rate at the end of the linear decay will be `init_lr * min_lr_ratio`.
        adam_beta1 (`float`, *optional*, defaults to 0.9):
            The beta1 to use in Adam.
        adam_beta2 (`float`, *optional*, defaults to 0.999):
            The beta2 to use in Adam.
        adam_epsilon (`float`, *optional*, defaults to 1e-8):
            The epsilon to use in Adam.
        adam_clipnorm (`float`, *optional*, defaults to `None`):
            If not `None`, clip the gradient norm for each weight tensor to this value.
        adam_global_clipnorm (`float`, *optional*, defaults to `None`)
            If not `None`, clip gradient norm to this value. When using this argument, the norm is computed over all
            weight tensors, as if they were concatenated into a single vector.
        weight_decay_rate (`float`, *optional*, defaults to 0):
            The weight decay to use.
        power (`float`, *optional*, defaults to 1.0):
            The power to use for PolynomialDecay.
        include_in_weight_decay (`list[str]`, *optional*):
            List of the parameter names (or re patterns) to apply weight decay to. If none is passed, weight decay is
            applied to all parameters except bias and layer norm parameters.
    )r   decay_stepsend_learning_rater   )r   r   r   r:   )	LayerNorm
layer_normbias)	learning_raterG   beta_1beta_2epsilonclipnormglobal_clipnormexclude_from_weight_decayrH   )rO   rP   rQ   rR   rS   rT   )	schedulesPolynomialDecayr   AdamWeightDecayr   
optimizersr   )r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   r   rH   lr_schedule	optimizers                 r   create_optimizerr\   _   s    \ ++%#&66!L0	 ,  K  
")))
 
 

 3#%/ "0&I&I&I$;

 

 

		 $))% "0 * 
 
	 k!!r   c                   
    e Zd ZdZ	 	 	 	 	 	 	 	 	 dd	eeej        f         d
ededededede	e
e                  de	e
e                  def fdZe fd            Z fdZd Zd fd	Zd Zd fd	Zd fd	Z fdZd Z xZS )rX   am
  
    Adam enables L2 weight decay and clip_by_global_norm on gradients. Just adding the square of the weights to the
    loss function is *not* the correct way of using L2 regularization/weight decay with Adam, since that will interact
    with the m and v parameters in strange ways as shown in [Decoupled Weight Decay
    Regularization](https://huggingface.co/papers/1711.05101).

    Instead we want to decay the weights in a manner that doesn't interact with the m/v parameters. This is equivalent
    to adding the square of the weights to the loss with plain (non-momentum) SGD.

    Args:
        learning_rate (`Union[float, LearningRateSchedule]`, *optional*, defaults to 0.001):
            The learning rate to use or a schedule.
        beta_1 (`float`, *optional*, defaults to 0.9):
            The beta1 parameter in Adam, which is the exponential decay rate for the 1st momentum estimates.
        beta_2 (`float`, *optional*, defaults to 0.999):
            The beta2 parameter in Adam, which is the exponential decay rate for the 2nd momentum estimates.
        epsilon (`float`, *optional*, defaults to 1e-07):
            The epsilon parameter in Adam, which is a small constant for numerical stability.
        amsgrad (`bool`, *optional*, defaults to `False`):
            Whether to apply AMSGrad variant of this algorithm or not, see [On the Convergence of Adam and
            Beyond](https://huggingface.co/papers/1904.09237).
        weight_decay_rate (`float`, *optional*, defaults to 0.0):
            The weight decay to apply.
        include_in_weight_decay (`list[str]`, *optional*):
            List of the parameter names (or re patterns) to apply weight decay to. If none is passed, weight decay is
            applied to all parameters by default (unless they are in `exclude_from_weight_decay`).
        exclude_from_weight_decay (`list[str]`, *optional*):
            List of the parameter names (or re patterns) to exclude from applying weight decay to. If a
            `include_in_weight_decay` is passed, the names in it will supersede this list.
        name (`str`, *optional*, defaults to `"AdamWeightDecay"`):
            Optional name for the operations created when applying gradients.
        kwargs (`dict[str, Any]`, *optional*):
            Keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`, `decay`}. `clipnorm` is clip gradients by
            norm; `clipvalue` is clip gradients by value, `decay` is included for backward compatibility to allow time
            inverse decay of learning rate. `lr` is included for backward compatibility, recommended to use
            `learning_rate` instead.
    MbP?r;   r<   Hz>Fr:   NrO   rP   rQ   rR   amsgradrG   rH   rU   r   c
                 p     t                      j        ||||||	fi |
 || _        || _        || _        d S r   )r   r   rG   _include_in_weight_decay_exclude_from_weight_decay)r   rO   rP   rQ   rR   r`   rG   rH   rU   r   kwargsr   s              r   r   zAdamWeightDecay.__init__   sM     	$YYRXYYY!2(?%*C'''r   c                 \    dt           i}t                                          ||          S )z?Creates an optimizer from its config with WarmUp custom object.r   )custom_objects)r   r   from_config)clsconfigrf   r   s      r   rg   zAdamWeightDecay.from_config   s,     #F+ww""6."IIIr   c                     t                                          |||           t          j        | j        d          |||f         d<   d S )Nadam_weight_decay_rater!   rG   )r   _prepare_localr"   constantrG   )r   
var_device	var_dtypeapply_stater   s       r   rl   zAdamWeightDecay._prepare_local   sW    z9kBBBDFK")AE
 E
 E
Z+,-@AAAr   c                     |                      |j                  }|r?|                    ||z  ||j        |j        j        f         d         z  | j                  S t          j                    S )NrG   )use_locking)	_do_use_weight_decayr   
assign_subdevicedtype
base_dtype_use_lockingr"   no_op)r   varrO   rp   do_decays        r   _decay_weights_opz!AdamWeightDecay._decay_weights_op   sr    ,,SX66 	>>#k3:sy?S2T&UVi&jj - "    xzzr   c                     t          t          |           \  }} t                      j        t          ||          fd|i|S )Nr   )listzipr   apply_gradients)r   grads_and_varsr   rd   gradstvarsr   s         r   r   zAdamWeightDecay.apply_gradients  sG    C011u&uww&s5%'8'8NNtNvNNNr   c                     || j         |         i fS |pi }|                    ||f          }||                     ||          }||||f<   |d         d|ifS )z1Retrieves the learning rate with the given state.Nlr_trp   )_decayed_lr_tget_fallback_apply_state)r   rn   ro   rp   coefficientss        r   _get_lrzAdamWeightDecay._get_lr  sz    %i0"44!'R"
I'>??55j)LLL3?KY/0F#m[%AAAr   c                    |                      |j        |j        j        |          \  }}|                     |||          }t          j        |g          5   t                      j        ||fi |cd d d            S # 1 swxY w Y   d S r   )	r   ru   rv   rw   r|   r"   control_dependenciesr   _resource_apply_dense)r   gradrz   rp   r   rd   decayr   s          r   r   z%AdamWeightDecay._resource_apply_dense  s    ||CJ	0DkRRf&&sD+>>$eW-- 	F 	F05770sEEfEE	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 	Fs   A??BBc                    |                      |j        |j        j        |          \  }}|                     |||          }t          j        |g          5   t                      j        |||fi |cd d d            S # 1 swxY w Y   d S r   )	r   ru   rv   rw   r|   r"   r   r   _resource_apply_sparse)	r   r   rz   indicesrp   r   rd   r   r   s	           r   r   z&AdamWeightDecay._resource_apply_sparse  s    ||CJ	0DkRRf&&sD+>>$eW-- 	P 	P15771$WOOOO	P 	P 	P 	P 	P 	P 	P 	P 	P 	P 	P 	P 	P 	P 	P 	P 	P 	Ps   B  BBc                     t                                                      }|                    d| j        i           |S )NrG   )r   r0   updaterG   )r   ri   r   s     r   r0   zAdamWeightDecay.get_config  s8    ##%%*D,BCDDDr   c                     | j         dk    rdS | j        r| j        D ]	}||v r dS 
| j        r| j        D ]	}||v r dS 
dS )z0Whether to use L2 weight decay for `param_name`.r   FT)rG   rb   rc   )r   
param_namers      r   rs   z$AdamWeightDecay._do_use_weight_decay#  s    !Q&&5( 	 2    
??44 # * 	!4 ! !
?? 55 #tr   )	r^   r;   r<   r_   Fr:   NNrX   r   )r1   r2   r3   r4   r   r5   rV   LearningRateScheduleboolr   r~   r7   r   classmethodrg   rl   r|   r   r   r   r   r0   rs   r8   r9   s   @r   rX   rX      s       $ $P GL#&7;9=%D DUI$BBCD D 	D
 D D !D "*$s)!4D $,DI#6D D D D D D D$ J J J J [J

 
 
 
 
  O O O O O OB B BF F F F F FP P P P P P    
      r   rX   c                   P    e Zd ZdZd Zed             Zed             Zd Zd Z	dS )GradientAccumulatoraR  
    Gradient accumulation utility. When used with a distribution strategy, the accumulator should be called in a
    replica context. Gradients will be accumulated locally on each replica and without synchronization. Users should
    then call `.gradients`, scale the gradients if required, and pass the result to `apply_gradients`.
    c                 "    g | _         d| _        dS )zInitializes the accumulator.N)
_gradients_accum_stepsr/   s    r   r   zGradientAccumulator.__init__@  s     r   c                     | j         Yt          j        t          j        dt          j                  dt          j        j        t          j        j                  | _         | j         	                                S )zNumber of accumulated steps.Nr   )rv   F	trainablesynchronizationaggregation)
r   r"   Variablerm   int64VariableSynchronizationON_READVariableAggregationONLY_FIRST_REPLICAvaluer/   s    r   r    zGradientAccumulator.stepE  sf     $ "ARX... " : B2E	! ! !D  &&(((r   c                 P    | j         st          d          d | j         D             S )z1The accumulated gradients on the current replica.zBThe accumulator should be called first to initialize the gradientsc                 >    g | ]}||                                 n|S r   )r   .0gradients     r   
<listcomp>z1GradientAccumulator.gradients.<locals>.<listcomp>W  s,    ggg8H$8   hgggr   )r   
ValueErrorr/   s    r   	gradientszGradientAccumulator.gradientsR  s6      	cabbbggW[Wfggggr   c                    | j         s+| j        }| j                             d |D                        t          |          t          | j                   k    r4t	          dt          | j                    dt          |                     t          | j         |          D ]\  }}|||                    |           | j                            d           dS )z/Accumulates `gradients` on the current replica.c                     g | ]N}|Ht          j        t          j        |          dt           j        j        t           j        j                  n|OS )NFr   )r"   r   
zeros_liker   r   r   r   r   s     r   r   z0GradientAccumulator.__call__.<locals>.<listcomp>^  so     
 
 
 !  + Kh//"'(*(B(J$&$:$M	    "
 
 
r   z	Expected z gradients, but got Nr   )r   r    extendlenr   r   
assign_addr   )r   r   _accum_gradientr   s        r   r,   zGradientAccumulator.__call__Y  s     		AO""
 
 %.
 
 
   y>>S1111cT_)=)=ccSVW`SaSaccddd(+DOY(G(G 	4 	4$NH)h.B))(333$$Q'''''r   c                     | j         sdS | j                            d           | j         D ]+}|'|                    t          j        |                     ,dS )z8Resets the accumulated gradients on the current replica.Nr   )r   r   assignr"   r   )r   r   s     r   resetzGradientAccumulator.resets  sf     	F  ### 	9 	9H#h 7 7888	9 	9r   N)
r1   r2   r3   r4   r   propertyr    r   r,   r   r   r   r   r   r   5  s         ! ! !
 
) 
) X
) h h Xh( ( (49 9 9 9 9r   r   )	r:   r;   r<   r=   NNr:   r   N)r4   typingr   r   r   
tensorflowr"   tf_keras.optimizers.legacyr   ImportErrorModuleNotFoundError"tensorflow.keras.optimizers.legacymodeling_tf_utilsr   hasattrrY   rV   r	   r   r   r5   r6   r~   r7   r\   rX   r   r   r   r   <module>r      sA   F E , , , , , , , , , ,    8///////() 8 8 8777777778 % $ $ $ $ $ 75%'?@@ + *AII *I7
 7
 7
 7
 7
Y+ 7
 7
 7
| %),0"37Q" Q"Q"Q" Q" 	Q"
 Q" Q" Q" E?Q" #5/Q" Q" Q" &d3i0Q" Q" Q" Q"h~ ~ ~ ~ ~d ~ ~ ~DE9 E9 E9 E9 E9 E9 E9 E9 E9 E9s    ))