
     `i                     X   d dl Z d dlmZ d dlmZ ddlmZ ddlmZm	Z	  e	j
        e          Z e            rd dlZd Z	 	 	 d"dee         d	ed
         dee         dedef         fdZ	 	 	 d"dee         d	ed
         dee         dedef         fdZ	 	 	 d"dee         d	ed
         dee         dedef         fdZ	 d#ded	d
dee         dedef         fdZ	 d#ded	d
dee         dedef         fdZ	 d#ded	d
dee         dedef         fdZeeeeeedZ	 	 d$dedededee         dee         f
dZd#dedee         fdZd#dedee         fdZd#dedee         fdZd#dedee         fdZd#dedee         fdZ d#dedee         fd Z!eeeee e!dZ"d#dedee         fd!Z#dS )%    Nwraps)Optional   )PretrainedConfig)is_torch_availableloggingc                 P     d d t                      fd            }|S )ad  
    Decorator function to update the RoPE parameters in the forward pass, if the model is using a dynamic RoPE
    (i.e. a RoPE implementation that may recompute its frequencies in the forward pass).

    Args:
        rope_forward (Callable):
            The forward pass of the RoPE implementation.

    Returns:
        The decorated forward pass.
    c                    t          j        |          dz   }t          | j        d          r| j        j        }n| j        j        }||k    rWt          | d          s(|                     | j        ||dz             \  | _        }|                     d| j        d           dS | j	        
                    |          | _	        |                     d| j	        d           dS )	zbLongrope uses long factor if sequence is larger than original pretraining length, short otherwise.r    original_max_position_embeddingslong_inv_freqseq_leninv_freqF
persistentN)torchmaxhasattrconfigr   max_position_embeddingsrope_init_fnr   register_bufferoriginal_inv_freqto)selfposition_idsdevicer   r   _s         t/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/modeling_rope_utils.pylongrope_frequency_updatez6dynamic_rope_update.<locals>.longrope_frequency_update+   s    )L))A-4; BCC 	S/3{/[,,/3{/R,555411 (,(9(9K1QTU1U ): ) )%"A   T-?E RRRRR &*%;%>%>v%F%FD"  T-CPU VVVVV    c                    t          j        |          dz   }|| j        k    rD|                     | j        ||          \  }| _        |                     d|d           || _        || j        k     rZ| j        | j        k    rL| j        	                    |          | _        |                     d| j        d           | j        | _        dS dS dS )a  
        dynamic RoPE layers should recompute `inv_freq` in the following situations:
        1 - growing beyond the cached sequence length (allow scaling)
        2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
        r   r   r   Fr   N)
r   r   max_seq_len_cachedr   r   attention_scalingr   original_max_seq_lenr   r   )r   r   r   r   r   s        r    dynamic_frequency_updatez5dynamic_rope_update.<locals>.dynamic_frequency_update>   s     )L))A-T,,,/3/@/@f^e/@/f/f,Hd,  X% HHH&-D#T...43JTMf3f3f &*%;%>%>v%F%FD"  T-CPU VVV&*&?D### /.3f3fr"   c                     d| j         v r | ||j                   n| j         dk    r | ||j                    | ||          S )Ndynamic)r   longrope)	rope_typer   )r   xr   r'   r!   rope_forwards      r    wrapperz$dynamic_rope_update.<locals>.wrapperQ   sh    &&$$T<IIIII^z))%%dLJJJJ|D!\222r"   r   )r-   r.   r'   r!   s   ` @@r    dynamic_rope_updater/      sh    W W W&@ @ @& <3 3 3 3 3 3 3 Nr"   r   r   ztorch.devicer   returnztorch.Tensorc                 6   | j         }t          | dd          }t          | dd          p| j        | j        z  }t	          ||z            }d}d|t          j        d|dt
          j                                      |t
          j	                  |z  z  z  }||fS )	a  
    Computes the inverse frequencies according to the original RoPE implementation
    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration. This function assumes that the config will provide at least the following
            properties:

            *   rope_theta (`float`): The base wavelength from which the inverse frequencies will be derived.
            *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
            *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.

            Additionally, this function will make use of the following properties if they are found in the config:

            *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
                derived as hidden_size // num_attention_heads.
            *   partial_rotary_factor (`float`, *optional*): If less than 1.0, inverse frequencies will be returned for
                the first fraction of the head_dim. Defaults to 1.0.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.

    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
    partial_rotary_factor      ?head_dimNr      dtyper   r7   )

rope_thetagetattrhidden_sizenum_attention_headsintr   arangeint64r   float)	r   r   r   baser2   r4   dimattention_factorr   s	            r     _compute_default_rope_parametersrD   \   s    > D#F,CSIIvz400dF4F&Jd4dH
h..
/
/C du|AsAU[IIILLTZbgbmLnnqttuvH%%%r"   c                 V    | j         d         }t          | ||          \  }}||z  }||fS )a  
    Computes the inverse frequencies with linear scaling. Credits to the Reddit user /u/kaiokendev
    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration. This function assumes that the config will provide at least the following
            properties:

            *   rope_theta (`float`): The base wavelength from which the inverse frequencies will be derived.
            *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
            *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.

            Additionally, this function will make use of the following properties if they are found in the config:

            *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
                derived as hidden_size // num_attention_heads.
            *   partial_rotary_factor (`float`, *optional*): If less than 1.0, inverse frequencies will be returned for
                the first fraction of the head_dim. Defaults to 1.0.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.

    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
    factor)rope_scalingrD   )r   r   r   rF   r   rC   s         r    '_compute_linear_scaling_rope_parametersrH      sD    >  *F "B&&RY!Z!ZH
 H%%%r"   c                 V   | j         }t          | dd          }t          | d| j        | j        z            }t	          ||z            }| j        }| j        d         }d}	||}n_t          |t          j	                  r5t          j
        |t          j        ||j        |j                            }nt          ||          }|||z  |z  |dz
  z
  ||dz
  z  z  z  }d|t          j        d	|dt          j        
                              |t          j                  |z  z  z  }
|
|	fS )a	  
    Computes the inverse frequencies with NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla

    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration. This function assumes that the config will provide at least the following
            properties:

            *   rope_theta (`float`): The base wavelength from which the inverse frequencies will be derived.
            *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
            *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.
            *   max_position_embeddings (`int`): The default sequence length used to update the dynamic RoPE at
                inference time
            *   rope_scaling (`dict[str, float]`): The standard RoPE scaling parameters, from which `factor`
                will be accessed. The value of `factor` is used to determine the new base frequency, along with the
                current sequence length (seq_len), the maximum positional embeddings (max_position_embeddings), and the
                computed dimensionality (dim) of the rotary embeddings. If seq_len <= max_position_embeddings, this
                factor has no effect. If seq_len <= max_position_embeddings, this factor effectively stretches the
                context window using an exponent derived from `dim`.

            Additionally, this function will make use of the following properties if they are found in the config:

            *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
                derived as hidden_size // num_attention_heads.
            *   partial_rotary_factor (`float`, *optional*): If less than 1.0, inverse frequencies will be returned for
                the first fraction of the head_dim. Defaults to 1.0.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length, used to update the dynamic RoPE at inference time. If `None` or shorter than
            max_position_embeddings, this value will be overridden by max_position_embeddings.

    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
    r2   r3   r4   rF   Nr7   r   r   r5   r   r6   r8   )r9   r:   r;   r<   r=   r   rG   
isinstancer   Tensormaximumtensorr7   r   r   r>   r?   r   r@   )r   r   r   rA   r2   r4   rB   r   rF   rC   r   s              r    _compute_dynamic_ntk_parametersrO      sN   T D#F,CSIIvz6+=A[+[\\H
h..
/
/C$< *F )	GU\	*	* 8-L0gn]]]
 

 g677 FW$'>>6A:NTW[^ab[bTcddDdu|AsAU[IIILLTZbgbmLnnqttuvH%%%r"   c                    | j         }t          | dd          }t          | d| j        | j        z            }t	          ||z            }| j        d         }| j                            d          }| j                            d          }	| j                            d          }
| j                            d          p| j        }dd
}|6|	r)|
r't           |||	           |||
          z            }n ||          }| j                            d          pd}| j                            d          pd	}d fd}d }|t          j
        d|d                              |t          j                  |z  z  }d|z  }d||z  z  }| j                            dd          } |||||||          \  }}d	 ||||dz                                |t          j                  z
  }|d	|z
  z  ||z  z   }||fS )ak  
    Computes the inverse frequencies with NTK scaling. Please refer to the
    [original paper](https://huggingface.co/papers/2309.00071)

    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration. This function assumes that the config will provide at least the following
            properties:

            *   rope_theta (`float`): The base wavelength from which the inverse frequencies will be derived.
            *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
            *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.
            *   max_position_embeddings (`int`): The maximum length of the positional embeddings.
            *   rope_scaling (`dict[str, float | int]`): The standard RoPE scaling parameters, from which the following
                keys will be accessed:
                *   `attention_factor` (`float`, *optional*): The scaling factor to be applied to the computed cos/sin.
                    If None, the value is inferred from `factor`, `mscale`, and `mscale_all_dim` as avaialble.
                *   `beta_fast` (`float`, *optional*, defaults to 32): Parameter to set the boundary for extrapolation
                    (only) in the linear ramp function.
                *   `beta_slow` (`float`, *optional*, defaults to 1): Parameter to set the boundary for interpolation
                    (only) in the linear ramp function.
                *   `factor` (`float`, *optional*): The scaling factor applied when interpolating the position IDs to
                    extend the possible context length. Additionally, if `attention_factor` is None, the log of this
                    value is used to compute a value for `attention_factor`, possibly in conjunciton with `mscale` and
                    `mscale_all_dim`, if provided.
                *   `mscale` (`float`, *optional*): If `attention_factor` is None and both `mscale` and
                    `mscale_all_dim` are provided, `mscale` acts scalar augmenting `log(factor)` when computing the
                    numerator for the inferred value of `attention_factor`. If not provided, `attention_factor` will be
                    calculated based on `factor` only.
                *   `mscale_all_dim` (`float`, *optional*): If `attention_factor` is None and both `mscale` and
                    `mscale_all_dim` are provided, `mscale_all_dim` acts scalar augmenting `log(factor)` when computing
                    the denominator for the inferred value of `attention_factor`. If not provided, `attention_factor`
                    will be calculated based on `factor` only.
                *   `original_max_position_embeddings` (`int`, *optional*): The original max position embeddings used
                    during pretraining. If not provided, the function falls back to `max_position_embeddings`.
                *   `truncate` (`bool`, *optional*): Whether to truncate the correction range.

            Additionally, this function will make use of the following properties if they are found in the config:

            *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
                derived as hidden_size // num_attention_heads.
            *   partial_rotary_factor (`float`, *optional*, defaults to 1.0): If less than 1.0, inverse frequencies
                will be returned for the first fraction of the head_dim.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.

    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin.
    r2   r3   r4   rF   rC   mscalemscale_all_dimr   r   c                 L    | dk    rdS d|z  t          j        |           z  dz   S )Nr   r3   g?)mathlog)scalerQ   s     r    
get_mscalez,_compute_yarn_parameters.<locals>.get_mscale:  s,    A::3V|dhuoo-33r"   N	beta_fast    	beta_slowc                     |t          j        || dz  t           j        z  z            z  dt          j        |          z  z  S )zPInverse dimension formula to find the dimension based on the number of rotationsr5   )rT   rU   pi)num_rotationsrB   rA   r   s       r    find_correction_dimz5_compute_yarn_parameters.<locals>.find_correction_dimL  sA    dh6-!:Kdg:UVWWW\]`d`him`n`n\noor"   c                      | |||          } ||||          }|r(t          j        |          }t          j        |          }t          |d          t	          ||dz
            fS )z.Find dimension range bounds based on rotationsr   r   )rT   floorceilr   min)	low_rothigh_rotrB   rA   r   truncatelowhighr^   s	           r    find_correction_rangez7_compute_yarn_parameters.<locals>.find_correction_rangeP  st    !!'36MNN""8S$8OPP 	#*S//C9T??D3{{CcAg....r"   c                     | |k    r|dz  }t          j        |t           j                  | z
  || z
  z  }t          j        |dd          }|S )NgMbP?r6   r   r   )r   r>   float32clamp)rb   r   rB   linear_func	ramp_funcs        r    linear_ramp_factorz4_compute_yarn_parameters.<locals>.linear_ramp_factorY  sQ    #::5LC|Cu}===Cc	RKQ22	r"   r   r5   r8   re   T)r   )r9   r:   r;   r<   r=   rG   getr   r@   r   r>   r   )r   r   r   rA   r2   r4   rB   rF   rC   rQ   rR   r   rW   rX   rZ   rh   rn   	pos_freqsinv_freq_extrapolationinv_freq_interpolationre   rf   rg   inv_freq_extrapolation_factorr   r^   s                            @r    _compute_yarn_parametersrt      s   p D#F,CSIIvz6+=A[+[\\H
h..
/
/C *F*../ABB $$X..F(,,-=>>N BCCevGe %4 4 4 4  	2n 	2$ZZ%?%?**VUcBdBd%dee)z&11 #''44:I#''449Ip p p/ / / / /   aa003363UUX[[\I 9_ FY$67"&&z488H%%iCGgiqrrIC %&(:(:3cQh(O(O(R(RZ`hmhs(R(t(t$t!!&C"CD
 #@
@	A  %%%r"   c                 .   | j         }t          | dd          }t          | d| j        | j        z            }t	          ||z            }| j        d         }| j        d         }| j                            d          }	| j                            d          }
t          | dd	          x}r| j        |z  }	n| j        }|
G|	dk    rd}
n>t          j	        d
t          j
        |	          t          j
        |          z  z             }
|r(||k    r"t          j        |t          j        |          }n!t          j        |t          j        |          }t          j        d|dt          j        |                                          |z  }d|||z  z  z  }||
fS )a  
    Computes the inverse frequencies with LongRoPE scaling. Please refer to the
    [original implementation](https://github.com/microsoft/LongRoPE)

    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration. This function assumes that the config will provide at least the following
            properties:

            *   rope_theta (`float`): The base wavelength from which the inverse frequencies will be derived.
            *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
            *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.
            *   max_position_embeddings (`int`): The maximum length of the positional embeddings.
            *   original_max_position_embeddings (`int`, *optional*): The original max position embeddings used during
                pretraining. If not provided, defaults to `max_position_embeddings`.
            *   rope_scaling (`dict[str, float]`): The standard RoPE scaling parameters, from which the following keys
                will be accessed:
                *   `attention_factor` (`float`, *optional*): The scaling factor to be applied on the attention
                    computation. If unspecified, it defaults to value recommended by the implementation, inferred from
                    the value of `factor`.
                *   `factor` (`float`, *optional*): The scaling factor to apply to the RoPE embeddings. If both
                    `max_position_embeddings` and `original_max_position_embeddings` are provided, this value will be
                    overridden s the ratio between those values.
                *   `long_factor` (`float`, *optional*): The scale factor applied when computing the inverse
                    frequencies if `seq_len` is provided and greater than `original_max_position_embeddings`.
                *   `short_factor` (`float`, *optional*): The scale factor applied when computing the inverse
                    frequencies if `seq_len` is None or less-than-or-equal-to `original_max_position_embeddings`.

            Additionally, this function will make use of the following properties if they are found in the config:

            *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
                derived as hidden_size // num_attention_heads.
            *   partial_rotary_factor (`float`, *optional*, defaults to 1.0): If less than 1.0, inverse frequencies
                will be returned for the first fraction of the head_dim.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length.

    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin.
    r2   r3   r4   long_factorshort_factorrF   rC   r   Nr   rJ   r   r5   )r9   r:   r;   r<   r=   rG   ro   r   rT   sqrtrU   r   rN   rj   r>   r?   r@   )r   r   r   rA   r2   r4   rB   rv   rw   rF   rC   r   ext_factorsinv_freq_shaper   s                  r    _compute_longrope_parametersr{   s  s   ^ D#F,CSIIvz6+=A[+[\\H
h..
/
/C%m4K&~6L $$X..F*../ABB
 ,36;]_c+d+dd' J/2RR+1+I( S=="#yTXf-=-=Ii@j@j-j)jkk  U7===l;emFSSSl<u}VTTT\!S!5;vNNNTTVVY\\NkD.$889H%%%r"   c                    t          | ||          \  }}| j        d         }| j        d         }| j        d         }| j        d         }||z  }	||z  }
dt          j        z  |z  }t	          j        ||	k    ||z  |          }||z  |z
  ||z
  z  }d|z
  |z  |z  ||z  z   }||
k      ||	k     z  }t	          j        |||          }||fS )ap
  
    Computes the inverse frequencies for llama 3.1.

    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration. This function assumes that the config will provide at least the following
            properties:

            *   rope_theta (`float`): The base wavelength from which the inverse frequencies will be derived.
            *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
            *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.
            *   rope_scaling (`dict[str, float | int]`): The standard RoPE scaling parameters, from which the following
                keys will be accessed:
                *   `factor` (`float`, *optional*): The scaling factor applied to the inverse frequencies when 1) the
                    wavelength is greater than `low_freq_wavelen` prior to smoothing, and 2) to all inverse frequencies
                    during smoothing.
                *   `high_freq_factor` (`float`): The scale factor used to compute `high_freq_wavelen` and
                    the value for the denominator of the smoothing factor prior to the `low_freq_factor` shift.
                *   `low_freq_factor` (`float`): The scale factor used to compute `low_freq_wavelen` and
                    the shift applied to the numerator and denominator of the smoothing factor.
                    frequencies if `seq_len` is None or less-than-or-equal-to `original_max_position_embeddings`.
                *   `original_max_position_embeddings` (`int`): The original max position embeddings used
                    during pretraining. If not provided, the function falls back to `max_position_embeddings`.

            Additionally, this function will make use of the following properties if they are found in the config:

            *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
                derived as hidden_size // num_attention_heads.
            *   partial_rotary_factor (`float`, *optional*): If less than 1.0, inverse frequencies will be returned for
                the first fraction of the head_dim. Defaults to 1.0.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.
    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin.
    rF   low_freq_factorhigh_freq_factorr   r5   r   )rD   rG   rT   r\   r   where)r   r   r   r   rC   rF   r}   r~   old_context_lenlow_freq_wavelenhigh_freq_wavelenwaveleninv_freq_llamasmooth_factorsmoothed_inv_freqis_medium_freqs                   r    _compute_llama3_parametersr     s   T "B&&RY!Z!ZH *F)*;<O*+=>)*LMO&8'*::$'kH$G [+;!;X=NPXYYN$w.@EUXgEghM]*n<vEXfHff!223BR8R6SSN[1BNSSN+++r"   )defaultlinearr)   yarnr*   llama3r+   received_keysrequired_keysoptional_keysignore_keysc                     d|v r|dhz  }|                     d           |||z  }||z
  }|rt          d|  d|           |	||z
  |z
  }n||z
  }|r"t                              d|  d|            dS dS )zYCompare the received keys in `config.rope_scaling` against the expected and optional keystyper+   Nz9Missing required keys in `rope_scaling` for 'rope_type'='z': z5Unrecognized keys in `rope_scaling` for 'rope_type'=')addKeyErrorloggerwarning)r+   r   r   r   r   missing_keysunused_keyss          r    _check_received_keysr     s     &!+&&& $ =0L qoS\ooamooppp #m3mC#m3 ljyjj]hjjkkkkkl lr"   c                     | j         }|                    d|                    dd                     }dh}t          |                                          }t	          ||||           d S )Nr+   r   r   )rG   ro   setkeysr   )r   r   rG   r+   r   r   s         r    !_validate_default_rope_parametersr   0  sl    &L  l.>.>vt.L.LMMI MM))++,,MM=kZZZZZZr"   c                 `   | j         }|                    d|                    dd                     }ddh}t          |                                          }t	          ||||           |d         }|t          |t                    r|dk     rt                              d|            d S d S )Nr+   r   rF   r   r3   8`rope_scaling`'s factor field must be a float >= 1, got 	rG   ro   r   r   r   rK   r@   r   r   )r   r   rG   r+   r   r   rF   s          r    (_validate_linear_scaling_rope_parametersr   8  s    &L  l.>.>vt.L.LMMI (+M))++,,MM=kZZZZ(#F~Z66~&3,,ZRXZZ[[[[[ ;G,r"   c                 h   | j         }|                    d|                    dd                     }ddh}dh}t          |                                          }t	          |||||           |d         }|t          |t                    r|dk     rt                              d|            d S d S )Nr+   r   rF   r   r   r3   r   r   )r   r   rG   r+   r   r   r   rF   s           r    )_validate_dynamic_scaling_rope_parametersr   D  s    &L  l.>.>vt.L.LMMI (+M78M))++,,MM=-]hiiii(#F~Z66~&3,,ZRXZZ[[[[[ ;G,r"   c           	      Z   | j         }|                    d|                    dd                     }ddh}h d}t          |                                          }t	          |||||           |d         }|t          |t                    r|dk     rt                              d|            |                    d          }|8t          |t                    r|d	k     rt                              d
|            |                    d          }	|	2t          |	t                    st                              d|	            |                    d          }
|
2t          |
t                    st                              d|
            |	pd|
pdk     r!t                              d|	 d|
 d           | j                             d          }|8| j	        |z  }||k    r&t          
                    d| d| d| d           d S d S t          
                    d           d S )Nr+   r   rF   >   rQ   re   rX   rZ   rR   rC   r   r   r3   r   rC   r   L`rope_scaling`'s attention_factor field must be a float greater than 0, got rX   z6`rope_scaling`'s beta_fast field must be a float, got rZ   z6`rope_scaling`'s beta_slow field must be a float, got rY   r   zO`rope_scaling`'s beta_fast field must be greater than beta_slow, got beta_fast=z( (defaults to 32 if None) and beta_slow=z (defaults to 1 if None)r   zHThe explicitly set RoPE scaling factor (config.rope_scaling['factor'] = z) does not match the ratio implicitly set by other parameters (implicit factor = post-yarn context length / pre-yarn context length = config.max_position_embeddings / config.rope_scaling['original_max_position_embeddings'] = z). Using the explicit factor (z) in YaRN. This may cause unexpected behaviour in model usage, please correct the 'max_position_embeddings' fields in the model config.a~  config.rope_scaling['original_max_position_embeddings'], the pre-yarn context length, is unset. We will **assume** config.max_position_embeddings holds the pre-yarn context length. Some use cases may expect config.max_position_embeddings to hold the post-yarn context length (pre-yarn context length * factor) -- we recommend updating both fields for optimal downstream model usage.)rG   ro   r   r   r   rK   r@   r   r   r   warning_once)r   r   rG   r+   r   r   r   rF   rC   rX   rZ   r   implicit_factors                r    _validate_yarn_parametersr   R  s   &L  l.>.>vt.L.LMMI (+M  M ))++,,MM=-]hiiii(#F~Z66~&3,,ZRXZZ[[[#''(:;;#Z8H%-P-P#TdghThThm[kmm	
 	
 	
   --IZ	5%A%A[PY[[\\\  --IZ	5%A%A[PY[[\\\RIN++Z^g Z Z6?Z Z Z	
 	
 	
 (.':'>'>?a'b'b$'3 8;[[f$$u[a u u #	u u CI	u u u     %$ 	_	
 	
 	
 	
 	
r"   c                    | j         }|                    d|                    dd                     }h d}h d}t          |                                          }t	          |||||           t          | dd          }t          | d| j        | j        z            }t          ||z            }	|                    d	          }
t          |
t                    s6t          d
 |
D                       rt                              d|
            t          |
          |	dz  k    r0t                              d|	dz   dt          |
                      |                    d          }t          |t                    s6t          d |D                       rt                              d|            t          |          |	dz  k    r0t                              d|	dz   dt          |                      t          | d          rt                              d           d S |                    d          }|t                              d           n8t          |t"                    r|dk     rt                              d|            |                    d          }|:t          |t"                    r|dk     r!t                              d|            d S d S d S )Nr+   r   >   r+   rv   rw   >   rF   rC   r   r   r2   r3   r4   rw   c              3   N   K   | ] }t          |t          t          f          V  !d S NrK   r=   r@   .0r,   s     r    	<genexpr>z0_validate_longrope_parameters.<locals>.<genexpr>  s1      1d1dRS*Qe2M2M1d1d1d1d1d1dr"   zC`rope_scaling`'s short_factor field must be a list of numbers, got r5   z5`rope_scaling`'s short_factor field must have length z, got rv   c              3   N   K   | ] }t          |t          t          f          V  !d S r   r   r   s     r    r   z0_validate_longrope_parameters.<locals>.<genexpr>  s1      0b0bQRAU|1L1L0b0b0b0b0b0br"   zB`rope_scaling`'s long_factor field must be a list of numbers, got z4`rope_scaling`'s long_factor field must have length r   aY  This model has set a `original_max_position_embeddings` field, to be used together with `max_position_embeddings` to determine a scaling factor. Please set the `factor` field of `rope_scaling`with this ratio instead -- we recommend the use of this field over `original_max_position_embeddings`, as it is compatible with most model architectures.rF   z1Missing required keys in `rope_scaling`: 'factor'r   rC   g        r   )rG   ro   r   r   r   r:   r;   r<   r=   rK   listallr   r   lenr   r   r@   )r   r   rG   r+   r   r   r   r2   r4   rB   rw   rv   rF   rC   s                 r    _validate_longrope_parametersr     s'   &L  l.>.>vt.L.LMMI@@@MVVVM))++,,MM=-]hiiii#F,CSIIvz6+=A[+[\\H
h..
/
/C##N33LlD)) mc1d1dWc1d1d1d.d.d mk]ikklll
<C1H$$rsVWxrr_bco_p_prrsss""=11Kk4(( kS0b0bVa0b0b0b-b-b ki\giijjj
;3!8##pcUVhpp^abm^n^nppqqq
 v9:: A	
 	
 	
 	
 	
 !!(++>NNNOOOOFE** 	`fsllNN^V\^^___'++,>??'.66 :JS:P:Pucsuu     (':P:Pr"   c                 t   | j         }|                    d|                    dd                     }h d}t          |                                          }t	          ||||           |d         }|t          |t                    r|dk     rt                              d|            |d         }|d	         }|t          |t                    st                              d
|            |t          |t                    st                              d|            ||k    r t                              d| d|            |d         }	|	t          |	t                    st                              d|	            |	| j
        k    r't                              d|	 d| j
                    d S d S )Nr+   r   >   rF   r+   r}   r~   r   r   rF   r3   r   r}   r~   z<`rope_scaling`'s low_freq_factor field must be a float, got z=`rope_scaling`'s high_freq_factor field must be a float, got zc`rope_scaling`'s high_freq_factor field must be greater than low_freq_factor, got high_freq_factor=z and low_freq_factor=r   zP`rope_scaling`'s original_max_position_embeddings field must be an integer, got zg`rope_scaling`'s original_max_position_embeddings field must be less than max_position_embeddings, got z and max_position_embeddings=)rG   ro   r   r   r   rK   r@   r   r   r=   r   )
r   r   rG   r+   r   r   rF   r}   r~   r   s
             r    _validate_llama3_parametersr     s   &L  l.>.>vt.L.LMMIvvvM))++,,MM=kZZZZ(#F~Z66~&3,,ZRXZZ[[["#45O#$67j%&H&HgVegghhhz2BE'J'JiWgiijjj?**HH H6EH H	
 	
 	

 (44V'W$'/zBbdg7h7h/2/2 2	
 	
 	
 (6+IIIo/o oNTNlo o	
 	
 	
 	
 	
 JIr"   c                    t          | dd          }|dS |                    d|                    dd                    }t                              |          }| || |           dS t                              d| d           dS )	zO
    Validate the RoPE config arguments, given a `PretrainedConfig` object
    rG   Nr+   r   r   r   zTMissing validation function mapping in `ROPE_VALIDATION_FUNCTIONS` for 'rope_type'='')r:   ro   ROPE_VALIDATION_FUNCTIONSr   r   )r   r   rG   r+   validation_fns        r    rope_config_validationr     s     6>488L   l.>.>vy.Q.QRRI-11)<<M f+666666oclooo	
 	
 	
 	
 	
r"   )NNNr   )NN)$rT   	functoolsr   typingr   configuration_utilsr   utilsr   r	   
get_logger__name__r   r   r/   r=   tupler@   rD   rH   rO   rt   r{   r   ROPE_INIT_FUNCTIONSstrr   r   r   r   r   r   r   r   r   r    r"   r    <module>r      s                1 1 1 1 1 1 . . . . . . . . 
	H	%	%  LLL; ; ;~ *.'+!(& (&%&(&^$(& c](& >5 !	(& (& (& (&X *.'+!(& (&%&(&^$(& c](& >5 !	(& (& (& (&X *.'+!A& A&%&A&^$A& c]A& >5 !	A& A& A& A&J PTz& z&z&&4z&?G}z&
>5 !z& z& z& z&| PTO& O&O&&4O&?G}O&
>5 !O& O& O& O&f PT>, >,>,&4>,?G}>,
>5 !>, >, >, >,J 05.$,(   $(!%l lll l C=	l
 #l l l l:[ [.> [XVY] [ [ [ [	\ 	\5E 	\T\]`Ta 	\ 	\ 	\ 	\\ \6F \U]^aUb \ \ \ \?
 ?
&6 ?
Xc] ?
 ?
 ?
 ?
D/ /*: /RU / / / /d!
 !
(8 !
xPS} !
 !
 !
 !
L 168%-)  
 
#3 
(3- 
 
 
 
 
 
r"   