
     `iL                    	   d Z ddlZddlmZ ddlmZmZmZ ddlZddl	m
Z
 ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZ ddlmZmZ ddlmZ  ej        e          Z  G d de
j!                  Z" G d de
j!                  Z# G d de
j!                  Z$ G d de
j!                  Z% G d de
j!                  Z& G d de
j!                  Z'	 	 	 d{de
j!        dej(        dej(        dej(        d eej(                 d!ee)         d"e)d#eej(                 fd$Z* G d% d&e
j!                  Z+ G d' d(e
j!                  Z, G d) d*e
j!                  Z- G d+ d,e
j!                  Z. G d- d.e
j!                  Z/ G d/ d0e
j!                  Z0 G d1 d2e
j!                  Z1e G d3 d4e                      Z2 G d5 d6e
j!                  Z3	 	 	 d|d8ej(        d9e)d:ee4         d;e5d<e6f
d=Z7	 	 d}d8ej(        d>ee4e6f         d:ee4         d<e6fd?Z8 G d@ dAe
j!                  Z9 G dB dCe
j!                  Z: G dD dEe
j!                  Z; G dF dGe
j!                  Z< G dH dIe
j!                  Z=e edJK           G dL dMe                                  Z> G dN dOe2          Z?e edPK           G dQ dRe                                  Z@ edSK           G dT dUe2                      ZAe edVK           G dW dXe                                  ZB edYK           G dZ d[e2                      ZCe ed\K           G d] d^e                                  ZDe ed_K           G d` dae                                  ZEe ed_K           G db dce                                  ZFddejG        jH        deej(        dfej(        fdgZId~dhej(        dieej(                 dfej(        fdjZJ G dk dle2          ZKe edmK           G dn doe                                  ZL G dp dqe2          ZMe edrK           G ds dte                                  ZN G du dve
j!                  ZO edwK           G dx dye2                      ZPg dzZQdS )zPyTorch PatchTSMixer model.    N)	dataclass)CallableOptionalUnion)PreTrainedModel)ModelOutput   )FlashAttentionKwargs)ALL_ATTENTION_FUNCTIONS)Unpack)NegativeBinomialOutputNormalOutputStudentTOutput)auto_docstringlogging   )PatchTSMixerConfigc                   2     e Zd ZdZdedef fdZd Z xZS )PatchTSMixerGatedAttentionz
    Module that applies gated attention to input data.

    Args:
        in_size (`int`): The input size.
        out_size (`int`): The output size.
    in_sizeout_sizec                     t                                                       t          j        ||          | _        t          j        d          | _        d S )Ndim)super__init__nnLinear
attn_layerSoftmaxattn_softmax)selfr   r   	__class__s      /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/patchtsmixer/modeling_patchtsmixer.pyr   z#PatchTSMixerGatedAttention.__init__/   sG    )GX66J2...    c                 `    |                      |                     |                    }||z  }|S N)r"   r    )r#   inputsattn_weights      r%   forwardz"PatchTSMixerGatedAttention.forward4   s0    ''(?(?@@+%r&   )__name__
__module____qualname____doc__intr   r+   __classcell__r$   s   @r%   r   r   &   sd         / /s / / / / / /
      r&   r   c                   >     e Zd ZdZdef fdZdej        fdZ xZ	S )PatchTSMixerBatchNormzP
    Compute batch normalization over the sequence length (time) dimension.
    configc                     t                                                       t          j        |j        |j                  | _        d S )Neps)r   r   r   BatchNorm1dd_modelnorm_eps	batchnormr#   r5   r$   s     r%   r   zPatchTSMixerBatchNorm.__init__@   s7    FOLLLr&   r)   c                     |                     dd          }|                     |          }|                     dd          S )a  
        Parameters:
            inputs (`torch.Tensor` of shape `(batch_size, sequence_length, d_model)`):
                input for Batch norm calculation
        Returns:
            `torch.Tensor` of shape `(batch_size, sequence_length, d_model)`
        r      )	transposer<   )r#   r)   outputs      r%   r+   zPatchTSMixerBatchNorm.forwardD   s@     !!!Q''''1%%%r&   
r,   r-   r.   r/   r   r   torchTensorr+   r1   r2   s   @r%   r4   r4   ;   ss         M1 M M M M M M
&el 
& 
& 
& 
& 
& 
& 
& 
&r&   r4   c                   h     e Zd ZdZdef fdZededej        fd            Z	de
j        fdZ xZS )PatchTSMixerPositionalEncodingz'
    Class for positional encoding
    r5   c                     t                                                       |j        r|                     |          | _        d S t          j        t          j        |j	        |j
                            | _        d S r(   )r   r   use_positional_encoding_init_peposition_encr   	ParameterrC   zerosnum_patchesr:   r=   s     r%   r   z'PatchTSMixerPositionalEncoding.__init__V   sh    ) 	^ $f 5 5D "U[9KV^-\-\ ] ]Dr&   returnc                    | j         dk    r5t          j        t          j        | j        | j                  d          }nD| j         dk    r!t          j        | j        | j                  }t          j        d| j                  	                    d          }t          j
        t          j        d| j        d          t          j        d          | j        z   z            }t          j        ||z            |d d dd df<   t          j        ||z            |d d dd df<   ||                                z
  }||                                d	z  z  }t          j        |d
          }nt#          | j          d          |S )NrandomTrequires_gradsincosr   r   r?   g     @
   FzN is not a valid positional encoder. Available types are 'random' and 'sincos'.)positional_encoding_typer   rK   rC   randnrM   r:   rL   arange	unsqueezeexpmathlogsincosmeanstd
ValueError)r5   rJ   positiondiv_terms       r%   rI   z'PatchTSMixerPositionalEncoding._init_pe^   s    *h66<F4F(W(WgklllLL,88 ;v'96>JJL|Av'9::DDQGGHya!C!CQXHYHY\b\jHjFk!kllH$)Ih.A$B$BLADqD!$)Ih.A$B$BLADqD!',*;*;*=*==L'<+;+;+=+=+BCL<EJJJLL2  C  C  C   r&   patch_inputc                     || j         z   }|S r(   )rJ   )r#   rc   hidden_states      r%   r+   z&PatchTSMixerPositionalEncoding.forwardr   s    "T%66r&   )r,   r-   r.   r/   r   r   staticmethodr   rK   rI   rC   rD   r+   r1   r2   s   @r%   rF   rF   Q   s         ^1 ^ ^ ^ ^ ^ ^ +     \&5<        r&   rF   c                   >     e Zd ZdZdef fdZdej        fdZ xZ	S )PatchTSMixerNormLayerzeNormalization block

    Args:
        config (`PatchTSMixerConfig`):
            Configuration.
    r5   c                    t                                                       |j        | _        d|j                                        v rt	          |          | _        d S t          j        |j        |j	                  | _        d S )Nbatchr7   )
r   r   norm_mlplowerr4   normr   	LayerNormr:   r;   r=   s     r%   r   zPatchTSMixerNormLayer.__init__   sl    fo++-----f55DIIIV^IIIDIIIr&   r)   c                 T   d| j                                         v rwt          j        ||j        d         |j        d         z  |j        d         |j        d         f          }|                     |          }t          j        ||j                  }n|                     |          }|S )a  
        Args:
            inputs (`torch.Tensor` of shape `((batch_size, num_channels, num_patches, d_model))`):
                Input to the normalization layer.
        Returns:
            `torch.Tensor` of shape `((batch_size, num_channels, num_patches, d_model))`
        rj   r   r   r?   r	   )rk   rl   rC   reshapeshaperm   )r#   r)   inputs_reshapeds      r%   r+   zPatchTSMixerNormLayer.forward   s     dm))++++#mLOfl1o5LOLO O #ii88O ]?FLAAFF YYv&&Fr&   rB   r2   s   @r%   rh   rh   x   ss         J1 J J J J J Jel        r&   rh   c                   4     e Zd Z fdZdej        fdZ xZS )PatchTSMixerMLPc                 <   t                                                       ||j        z  }t          j        ||          | _        t          j        |j                  | _        t          j        ||          | _	        t          j        |j                  | _
        d S r(   )r   r   expansion_factorr   r   fc1Dropoutdropoutdropout1fc2dropout2)r#   in_featuresout_featuresr5   
num_hiddenr$   s        r%   r   zPatchTSMixerMLP.__init__   sv     6#::
9[*55
6>229Z66
6>22r&   r)   c                     |                      t          j                            |                     |                              }|                     |          }|                     |          }|S )z
        Args:
            inputs (`torch.Tensor` of shape `((batch_size, num_channels, num_patches, d_model))`):
                Input to the MLP layer.
        Returns:
            `torch.Tensor` of the same shape as `inputs`
        )rz   r   
functionalgelurw   r{   r|   )r#   r)   s     r%   r+   zPatchTSMixerMLP.forward   sX     r}11$((62B2BCCDD&!!v&&r&   )r,   r-   r.   r   rC   rD   r+   r1   r2   s   @r%   rt   rt      sU        3 3 3 3 3el        r&   rt   c                   >     e Zd ZdZdef fdZdej        fdZ xZ	S )$PatchTSMixerChannelFeatureMixerBlockzThis module mixes the features in the channel dimension.

    Args:
        config (`PatchTSMixerConfig`):
            Configuration.
    r5   c                    t                                                       t          |          | _        |j        | _        t          |j        |j        |          | _        |j        r"t          |j        |j                  | _	        d S d S Nr}   r~   r5   r   r   )
r   r   rh   rm   
gated_attnrt   num_input_channelsmlpr   gating_blockr=   s     r%   r   z-PatchTSMixerChannelFeatureMixerBlock.__init__   s    )&11	 +"12
 
 
  	 :1F<U! ! !D	 	r&   r)   c                     |}|                      |          }|                    dddd          }| j        r|                     |          }|                     |          }|                    dddd          }||z   }|S )z
        Args:
            inputs (`torch.Tensor` of shape `((batch_size, num_channels, num_patches, d_model))`):
                input to the MLP layer
        Returns:
            `torch.Tensor` of the same shape as `inputs`
        r   r	   r?   r   )rm   permuter   r   r   )r#   r)   residualouts       r%   r+   z,PatchTSMixerChannelFeatureMixerBlock.forward   s     6""1a++? 	/&&v..F&!!1a++x
r&   rB   r2   s   @r%   r   r      sl         1       el        r&   r           modulequerykeyvalueattention_maskscalingry   	head_maskc                    ||                     d          dz  }t          j        ||                    dd                    |z  }	||	|z   }	t          j                            |	d          }	||	|                    dddd          z  }	t          j                            |	|| j	                  }	t          j        |	|          }
|
                    dd          
                                }
|
|	fS )Nr         r?   r	   r   r   )ptraining)sizerC   matmulr@   r   r   softmaxviewry   r   
contiguous)r   r   r   r   r   r   ry   r   kwargsattn_weightsattn_outputs              r%   eager_attention_forwardr      s     **R..D(<s}}Q':':;;gEL!#n4=((2(>>L#innQAq&A&AA=((6?([[L,|U33K''1--88::K$$r&   c                   >    e Zd ZdZ	 	 	 	 	 ddededed	ed
ededee         f fdZ		 	 	 	 dde
j        dee
j                 dee
j                 dee
j                 dee         dee         dee
j        ee
j                 eee
j                          f         fdZ xZS )PatchTSMixerAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr   FTN	embed_dim	num_headsry   
is_decoderbias	is_causalr5   c                 
   t                                                       || _        || _        || _        ||z  | _        || _        | j        |z  | j        k    rt          d| j         d| d          | j        dz  | _        || _	        || _
        t          j        |||          | _        t          j        |||          | _        t          j        |||          | _        t          j        |||          | _        d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: ).r   )r   )r   r   r   r   ry   head_dimr5   r`   r   r   r   r   r   k_projv_projq_projout_proj)	r#   r   r   ry   r   r   r   r5   r$   s	           r%   r   zPatchTSMixerAttention.__init__  s    	""!Y.MI%$.883dn 3 3%.3 3 3   }d*$"i	94@@@i	94@@@i	94@@@	)YTBBBr&   hidden_stateskey_value_statesr   layer_head_maskoutput_attentionsr   rN   c                    |du}|j         dd         \  }}	|r|j         d         n|	}
||	d| j        f}||
d| j        f} |                     |          j        |                     dd          }|r|n|} |                     |          j        |                     dd          } |                     |          j        |                     dd          }t          }| j        j	        dk    rt          | j        j	                 } || ||||f| j        sdn| j        | j        ||d|\  }}|                    ||	d                                          }|                     |          }||dfS )z#Input shape: Batch x Time x ChannelNr   r   r?   eagerr   )ry   r   r   r   )rq   r   r   r   r@   r   r   r   r5   _attn_implementationr   r   ry   r   rp   r   r   )r#   r   r   r   r   r   r   is_cross_attentionbsztgt_lensrc_lenq_input_shapekv_input_shapequery_statescurrent_states
key_statesvalue_statesattention_interfacer   r   s                       r%   r+   zPatchTSMixerAttention.forward2  s    .T9 %*3B3/W/AN"(++wgr4=9wDM: 7t{{=116FPPQRTUVV-?R))]5T[[005~FPPQRTUVV
7t{{>227HRRSTVWXX(?;+w66"9$+:Z"[$7$7%
  $}>CC$,L/%%
 %
 %
 %
!\ "))#w;;FFHHmmK00L$..r&   )r   FTFN)NNNF)r,   r-   r.   r/   r0   floatboolr   r   r   rC   rD   r   r
   tupler+   r1   r2   s   @r%   r   r     s_       GG  /3C CC C 	C
 C C C +,C C C C C CD 481526,13/ 3/|3/ #5<03/ !.	3/
 "%,/3/ $D>3/ -.3/ 
u|Xel3XeEL>Q5RR	S3/ 3/ 3/ 3/ 3/ 3/ 3/ 3/r&   r   c                   .     e Zd ZdZdef fdZd Z xZS )PatchMixerBlockzxThis module mixes the patch dimension.

    Args:
        config (`PatchTSMixerConfig`):
            Configuration.
    r5   c                    t                                                       t          |          | _        |j        | _        |j        | _        t          |j        |j        |          | _        |j        r t          |j        |j                  | _
        |j        r=t          |j        |j        |j        |          | _        t          |          | _        d S d S )Nr   r   )r   r   ry   r5   )r   r   rh   rm   	self_attnr   rt   rM   r   r   r   r   r:   self_attn_headsry   self_attn_layer	norm_attnr=   s     r%   r   zPatchMixerBlock.__init__p  s    )&11	) +"*+
 
 
  	t :6CU`f`r s s sD 	;#8 . 0	$ $ $D  36::DNNN	; 	;r&   c                    |}|                      |          }| j        rY|j        \  }}}}|                    ||z  ||          }|                     |d          \  }}	}	|                    ||||          }|                    dd          }|                     |          }| j        r|                     |          }|                    dd          }| j        r| 	                    ||z             }||z   }
|
S )z
        Args:
            hidden_state (`torch.Tensor`): Input tensor.

        Returns:
            `torch.Tensor`: Transformed tensor.
        F)r   r?   r	   )
rm   r   rq   rp   r   r@   r   r   r   r   )r#   re   r   
batch_sizen_varsrM   r:   hidden_state_reshapedx_attn_r   s              r%   r+   zPatchMixerBlock.forward  s     yy..> 	N7C7I4JW$0$8$8f9Lk[b$c$c!//0EY^/__LFAq^^JWMMF $--a33xx--? 	;,,\::L $--a33> 	A>>,*?@@LX%
r&   r,   r-   r.   r/   r   r   r+   r1   r2   s   @r%   r   r   h  s^         ;1 ; ; ; ; ; ;4! ! ! ! ! ! !r&   r   c                   >     e Zd ZdZdef fdZdej        fdZ xZ	S )FeatureMixerBlockzThis module mixes the hidden feature dimension.

    Args:
        config (`PatchTSMixerConfig`):
            Configuration.

    r5   c                    t                                                       t          |          | _        |j        | _        t          |j        |j        |          | _        |j        r"t          |j        |j                  | _	        d S d S r   )
r   r   rh   rm   r   rt   r:   r   r   r   r=   s     r%   r   zFeatureMixerBlock.__init__  s    )&11	 +"
 
 
  	l :6>\b\j k k kD	l 	lr&   hiddenc                     |}|                      |          }|                     |          }| j        r|                     |          }||z   }|S )
        Args:
            hidden (`torch.Tensor` of shape `(batch_size, num_patches, d_model)`):
                Input tensor to the layer.

        Returns:
            `torch.Tensor`: Transformed tensor.
        )rm   r   r   r   )r#   r   r   r   s       r%   r+   zFeatureMixerBlock.forward  sW     6""&!!? 	/&&v..Fx
r&   rB   r2   s   @r%   r   r     ss         l1 l l l l l l el        r&   r   c                   >     e Zd ZdZdef fdZdej        fdZ xZ	S )PatchTSMixerLayerz
    The `PatchTSMixer` layer that does all three kinds of mixing.

    Args:
        config (`PatchTSMixerConfig`):
            Configuration.

    r5   c                     t                                                       t          |          | _        t	          |          | _        |j        | _        |j        dk    rt          |          | _        d S d S )Nr5   mix_channel)	r   r   r   patch_mixerr   feature_mixermoder   channel_feature_mixerr=   s     r%   r   zPatchTSMixerLayer.__init__  sw    *&999.f===K	;-'')MU[)\)\)\D&&& ('r&   r   c                     | j         dk    r|                     |          }|                     |          }|                     |          }|S )r   r   )r   r   r   r   )r#   r   s     r%   r+   zPatchTSMixerLayer.forward  sO     9%%//77F!!&))##F++r&   rB   r2   s   @r%   r   r     ss         	]1 	] 	] 	] 	] 	] 	]el        r&   r   c                   6     e Zd ZdZdef fdZddefdZ xZS )PatchTSMixerBlockzThe main computing framework of the `PatchTSMixer` model.

    Args:
        config (`PatchTSMixerConfig`):
            Configuration.
    r5   c                     t                                                       j        }t          j        fdt          |          D                       | _        d S )Nc                 0    g | ]}t                     S )r   )r   ).0r   r5   s     r%   
<listcomp>z.PatchTSMixerBlock.__init__.<locals>.<listcomp>  s%    $a$a$a!%6f%E%E%E$a$a$ar&   )r   r   
num_layersr   
ModuleListrangemixers)r#   r5   r   r$   s    ` r%   r   zPatchTSMixerBlock.__init__	  sU    &
m$a$a$a$auU_O`O`$a$a$abbr&   Foutput_hidden_statesc                 v    g }|}| j         D ]$} ||          }|r|                    |           %|r||fS |dfS )as  
        Args:
            hidden_state (`torch.Tensor`): The input tensor.
            output_hidden_states (`bool`, *optional*, defaults to False.):
                Whether to output the hidden states as well.

        Returns:
            `torch.Tensor`: The embedding. `list`: List of all hidden states if `output_hidden_states` is set to
            `True`.
        N)r   append)r#   re   r   all_hidden_states	embeddingmods         r%   r+   zPatchTSMixerBlock.forward  sh      	; 	4 	4CII# 4!((333 	#///d?"r&   F)	r,   r-   r.   r/   r   r   r   r+   r1   r2   s   @r%   r   r     sv         c1 c c c c c c# #$ # # # # # # # #r&   r   c                   0     e Zd ZdZddef fdZd Z xZS )PatchTSMixerForPredictionHeadzqPrediction Head for Forecasting

    Args:
        config (`PatchTSMixerConfig`):
            Configuration.
    Nr5   c                    t                                                       |j        | _        | j        | j                                         t	          j        |j                  | _        |-t	          j        |j	        |j
        z  |j                  | _        n'|                    |j	        |j
        z            | _        t	          j        d          | _        d S )N	start_dim)r   r   prediction_channel_indicessortr   rx   head_dropoutdropout_layerr   rM   r:   prediction_lengthbase_forecast_blockget_parameter_projectionFlattenflatten)r#   r5   distribution_outputr$   s      r%   r   z&PatchTSMixerForPredictionHead.__init__2  s    *0*K'*6+00222Z(;<<&')y&2Dv~2UX^Xp'q'qD$$':'S'S"V^3( (D$ zB///r&   c                                           |          }                     |          }                     |          }t          |t                    rt	          d |D                       }n|                    dd          } j        @t          |t                    rt	           fd|D                       }n|d j        f         }|S )ar  

        Args:
            hidden_features (`torch.Tensor` of shape `(batch_size, num_patch, d_model)` in `flatten` mode
                or `(batch_size, n_vars, num_patch, d_model)` in `common_channel`/`mix_channel` mode.): Input hidden
                features.

        Returns:
            `torch.Tensor` of shape `(batch_size, prediction_length, nvars)`.

        c              3   B   K   | ]}|                     d d          V  dS )r   r   N)r@   )r   zs     r%   	<genexpr>z8PatchTSMixerForPredictionHead.forward.<locals>.<genexpr>U  s0      CCQQ[[R00CCCCCCr&   r   r   Nc              3   6   K   | ]}|d j         f         V  dS ).N)r   )r   r
  r#   s     r%   r  z8PatchTSMixerForPredictionHead.forward.<locals>.<genexpr>[  s0       [ [Q3(G#G!H [ [ [ [ [ [r&   .)r  r  r  
isinstancer   r@   r   r#   hidden_featuresforecasts   `  r%   r+   z%PatchTSMixerForPredictionHead.forwardD  s     ,,77,,_==++O<<h&& 	2CC(CCCCCHH))"b11H*6(E** J  [ [ [ [RZ [ [ [[[#C)H$HIr&   r(   r   r2   s   @r%   r   r   *  sc         0 01 0 0 0 0 0 0$      r&   r   c                   0     e Zd ZdZddef fdZd Z xZS )PatchTSMixerLinearHeadzLinear head for Classification and Regression.

    Args:
        config (`PatchTSMixerConfig`):
            Configuration.
    Nr5   c                    t                                                       |j        | _        |j        | _        |j        |j        }nd}|| _        |0t          j        |j        |j	        z  |z  |j
                  | _        n*|                    |j        |j	        z  |z            | _        |j        t          j        d          | _        nt          j        d          | _        t          j        |j                  | _        d S )Nr   r   r   )r   r   head_aggregationoutput_rangerM   r  r   r   r:   r   num_targets
projectionr  r  r  rx   r   ry   )r#   r5   r  
mul_factorr$   s       r%   r   zPatchTSMixerLinearHead.__init__j  s     & 7"/"*+JJJ#6 & i!::ZG" DOO
 2JJ!::ZG DO "*:333DLL:333DLz&"566r&   c                    |                     dd          }| j        dk    r	|d         }nH| j        dk    r|                    d          j        }n!| j        dk    r|                    d          }| j        r|                     |          }|                     |          }|                     |          }| j        E| j	        >t          j        |          | j	        d	         | j	        d
         z
  z  | j	        d
         z   }|S )ai  
        Args:
            hidden_features (`torch.Tensor` of shape `(batch_size x num_patch x d_model)` in `flatten` mode
                or `(batch_size x n_vars x num_patch x d_model)` in `common_channel`/`mix_channel` mode.): Input hidden
                features.

        Returns:
            `torch.Tensor` of shape `(batch_size x num_targets)`.
        r   r   use_last).r   max_poolr   avg_poolNr   r   )r@   r  maxvaluesr^   r  ry   r  r  r  rC   sigmoid)r#   r  s     r%   r+   zPatchTSMixerLinearHead.forward  s    *33B;; J..-g6OO"j00-11b199@OO"j00-22r2::O< 	<"ll?;;O,,77///::$,43D3Po..$2CA2FIZ[\I]2]^aearstauu  r&   r(   r   r2   s   @r%   r  r  b  sc         7 71 7 7 7 7 7 78             r&   r  c                   ,    e Zd ZU eed<   dZdZdZd ZdS )PatchTSMixerPreTrainedModelr5   modelpast_valuesFc                    t          |t                    r;| j        j        dk    r)t          j                            |j        dd           dS dS t          |t          j        t          j	        f          r?|j
        j                                         |j        j                            d           dS t          |t                    rI|j        j
        j                                         |j        j        j                            d           dS t          |t          j                  rR|j        j                            d| j        j                   |j
        "|j
        j                                         dS dS dS )zInitialize weightsrP   r   g?)r^   r_         ?N)r  rF   r5   rU   r   initnormal_rJ   rn   r9   r   datazero_weightfill_r4   r<   r   init_std)r#   r   s     r%   _init_weightsz)PatchTSMixerPreTrainedModel._init_weights  s_   f<== 	){3x?? 3#3GGGGG @?r~ >?? 		)K""$$$M$$S))))) 566 	)!&,,...#(..s33333	** 	)M&&CT[5I&JJJ{& &&(((((	) 	)&&r&   N)	r,   r-   r.   r   __annotations__base_model_prefixmain_input_namesupports_gradient_checkpointingr.   r&   r%   r"  r"    sD          #O&+#) ) ) ) )r&   r"  c                   .     e Zd ZdZdef fdZd Z xZS )PatchTSMixerPretrainHeadzcPretraining head.

    Args:
        config (`PatchTSMixerConfig`):
            Configuration.
    r5   c                     t                                                       t          j        |j                  | _        t          j        |j        |j                  | _	        d S r(   )
r   r   r   rx   r   r  r   r:   patch_lengthbase_pt_blockr=   s     r%   r   z!PatchTSMixerPretrainHead.__init__  sM    Z(;<<Yv~v7JKKr&   c                 Z    |                      |          }|                     |          }|S )a  
        Args:
            hidden_features (`torch.Tensor` of shape `(batch_size x num_patch x d_model)` in `flatten` mode
                or `(batch_size x n_vars x num_patch x d_model)` in `common_channel`/`mix_channel` mode.): Input hidden
                features.

        Returns:
            `torch.Tensor` of shape `(batch_size x n_vars x num_patch x patch_length)`.
        )r  r8  r  s      r%   r+   z PatchTSMixerPretrainHead.forward  s/     ,,_==%%o66r&   r   r2   s   @r%   r5  r5    se         L1 L L L L L L      r&   r5  Fr)   
mask_ratiounmasked_channel_indiceschannel_consistent_masking
mask_valuec                    |dk     s|dk    rt          d| d          | j        \  }}}}| j        }	t          |d|z
  z            }
|r0t	          j        |d||	          }|                    d|d          }nt	          j        ||||	          }t	          j        ||||	          }d|ddddd|
f<   t	          j        |d          }t	          j        |d          }t	          j	        |d|	          }|
                    d                              ddd|          }|d|dd|ddddf<   |                     |                                |          }||d
         fS )a  random_masking: Mask the input considering the control variables.

    Args:
        inputs (`torch.Tensor` of shape `(batch_size, num_channels, sequence_length, num_features)`):
            The input tensor to mask.
        mask_ratio (`float`):
            Masking ratio applied to mask the input data during random pretraining. It is the number between 0 and 1.
        unmasked_channel_indices (list, *optional*):
            Indices of channels that will not be masked.
        channel_consistent_masking (bool, *optional*, defaults to `False`):
            When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary
            across channels.
        mask_value (int, *optional*, defaults to 0):
            Define the value of masked patches for pretraining.

    Returns:
        `tuple(torch.Tensor)`: inputs_mask, masked input, same shape as input Tensor and mask tensor of shape [bs x c x
        n]
    r   r   zMask ratio z has to be between 0 and 1.deviceNr   r   )r   index.r   )r`   rq   r@  r0   rC   randrepeatonesargsortgatherrX   masked_fillr   )r)   r:  r;  r<  r=  r   num_channelssequence_lengthnum_featuresr@  len_keepnoisemaskids_shuffleids_restoreinputs_masks                   r%   random_maskingrR    s   4 A~~qNzNNNOOO>Dl;Jo|]F?a*n566H! U
:q/&IIIQa00 
:|_VTTT :j,OOODDAAAyy -2...K-444K<"K888D>>"$$Q1l;;D+23QQQ(!!!QQQ./$$TYY[[*==KV$$r&   num_forecast_mask_patchesc                    t          |t                    r|g}d |D             }| j        \  }}}}t          j        |||| j                  }	g }
d}t          |          }t          ||          D ]V\  }}|dk    s||k    rt          d| d          t          ||z  |z            }|
	                    |||g           ||z  }Wt          |
d           }
||k     r|
d         d         ||z
  z   |
d         d<   n#||k    r|
d	         d         ||z
  z   |
d	         d<   d}|
D ]\  }}}||z   }d
|	||dd| df<   |}t          j        |	j        d                   }|	|         }	|	                    d	                              d
d
d
|          }	|d|	dd|ddddf<   |                     |	                                |          }||	d         fS )a  Forecast masking that masks the last K patches where K is from the num_forecast_mask_patches.
    If num_forecast_mask_patches is a list, samples in the batch will be randomly masked by numbers defined in the list.

    Parameters:
        inputs (`torch.Tensor`):
            Input of shape `(bs, num_channels, num_patch, patch_length)`
        num_forecast_mask_patches (`list`):
            Number of patches to be masked at the end of each batch sample. e.g. 4 or [3, 5].
        unmasked_channel_indices (`list`, *optional*):
            Indices of channels that are not masked.
        mask_value (`int`, *optional*, defaults to 0):
            Values in the masked patches will be filled by `mask_value`.

    Returns:
        `tuple(torch.Tensor)`: inputs_mask, masked input, same shape as inputs Tensor and Mask tensor of shape `(bs,
        num_channels , num_patch)` or `(bs, tsg1, tsg2, num_channels, num_patch)`
    c                     g | ]}d S )r   r3  )r   r   s     r%   r   z$forecast_masking.<locals>.<listcomp>7  s    AAA!AAAAr&   r?  r   znum_forecast_mask_patches z6 should be greater than 0 and less than total patches.c                     | d         S Nr?   r3  )xs    r%   <lambda>z"forecast_masking.<locals>.<lambda>I  s
    !A$ r&   )r   r?   r   r   NrB  )r  r0   rq   rC   rL   r@  sumzipr`   r   sortedrandpermrX   rD  rH  r   )r)   rS  r;  r=  forecast_mask_ratiosr   rI  rJ  rK  rN  t_listtotal_lengthtotal_ratior7  ratiotemp_lenbatch1	patch_lenr   batch2permrQ  s                         r%   forecast_maskingrh    sU   0 +S11 @%>$?!AA'@AAA>Dl;Jo|;z<WWWDFL*++K"#<>RSS ! !e1 ? ?q\qqq   zE)K788|UH5666 F///Fj  ay|zL'@Aq	!	
	"	"r
1
)BCr
1F"(  	1h("./VF]AAA	z{{*+>$*Q-((D:D>>"$$Q1l;;D+23QQQ(!!!QQQ./$$TYY[[*==KV$$r&   c                   >     e Zd ZdZdef fdZdej        fdZ xZ	S )PatchTSMixerPatchifyz
    A class to patchify the time series sequence into different patches

    Returns:
        `torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`
    r5   c                    t                                                       |j        | _        |j        | _        |j        | _        | j        | j        k    r t          d| j         d| j         d          t          | j        | j                  | j        z
  | j        z  dz   | _        | j        | j        | j        dz
  z  z   }| j        |z
  | _	        d S )NzSequence length (z+) has to be greater than the patch length ()r   )
r   r   context_lengthrJ  r7  patch_strider`   r  rM   sequence_start)r#   r5   new_sequence_lengthr$   s      r%   r   zPatchTSMixerPatchify.__init__j  s    %4"/"/4#444yD$8yyeievyyy  
   4d6GHH4K\\aearruvv"/$2CtGWZ[G[2\\"25HHr&   r$  c                 ,   |j         d         }|| j        k    rt          d| d| j         d          |dd| j        dddf         }|                    d| j        | j                  }|                    dd                                          }|S )a!  
        Parameters:
            past_values (`torch.Tensor` of shape `(batch_size, sequence_length, num_channels)`, *required*):
                Input for patchification

        Returns:
            `torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`
        r   zInput sequence length (z%) doesn't match model configuration (r   N)	dimensionr   stepr  )	rq   rJ  r`   ro  unfoldr7  rn  r@   r   )r#   r$  rJ  rA   s       r%   r+   zPatchTSMixerPatchify.forward{  s     &+B/d222x/xx`d`txxx   QQQ 3 5 5qqq89$2C$J[\\!!"b))4466r&   rB   r2   s   @r%   rj  rj  b  ss         I1 I I I I I I"5<        r&   rj  c                   >     e Zd ZdZdef fdZdej        fdZ xZ	S )PatchTSMixerMaskinga  
    Class to perform random or forecast masking.

    Parameters:
        config (`PatchTSMixerConfig`): model config
    Returns:
        x_mask (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`)
            Masked patched input
        mask (`torch.Tensor` of shape `(batch_size, num_channels, num_patches)`)
            Bool tensor indicating True on masked points
    r5   c                    t                                                       |j        | _        |j        | _        |j        | _        |j        | _        |j        | _        |j        | _        | j        t          | j                  | _        d S d S r(   )	r   r   random_mask_ratior<  	mask_typerS  r;  r=  r\  r=   s     r%   r   zPatchTSMixerMasking.__init__  s    !'!9*0*K'))/)I&(.(G% +(4,243P,Q,QD))) 54r&   rc   c                 2   | j         dk    r,t          || j        | j        | j        | j                  \  }}nI| j         dk    r&t          || j        | j        | j                  \  }}nt          d| j          d          |	                                }||fS )a  
        Parameters:
            patch_input (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`, *required*):
                Patch input

        Return:
            masked_input (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`)
                Masked patched input
            mask (`torch.Tensor` of shape `(batch_size, num_channels, num_patches)`)
                Bool tensor indicating True on masked points

        rP   )r)   r:  r;  r<  r=  r  )r)   rS  r;  r=  zInvalid mask type .)
ry  rR  rx  r;  r<  r=  rh  rS  r`   r   )r#   rc   masked_inputrN  s       r%   r+   zPatchTSMixerMasking.forward  s     >X%%!/"1)-)F+/+J?" " "L$$ ^z))!1"*.*H)-)F?	" " "L$$ C$.CCCDDD yy{{T!!r&   rB   r2   s   @r%   rv  rv    ss        
 
	R1 	R 	R 	R 	R 	R 	R!"5< !" !" !" !" !" !" !" !"r&   rv  c            	            e Zd ZdZdef fdZdej        dej        deej        ej        ej        f         fdZ	 xZ
S )PatchTSMixerStdScalerz
    Standardize features by calculating the mean and scaling along the first dimension, and then normalizes it by
    subtracting from the mean and dividing by the standard deviation.
    r5   c                     t                                                       t          |d          r|j        nd| _        t          |d          r|j        nd| _        t          |d          r|j        nd| _        d S )Nscaling_dimr   keepdimTminimum_scalegh㈵>)r   r   hasattrr  r   r  r  r=   s     r%   r   zPatchTSMixerStdScaler.__init__  sy    )0)G)GN6%%Q)0)C)CMv~~5<V_5U5U_V11[_r&   r)  observed_indicatorrN   c                 d   |                     | j        | j                  }|                    d          }||z                       | j        | j                  |z  }||z
  |z  dz                       | j        | j                  |z  }t	          j        || j        z             }||z
  |z  ||fS )C  
        Parameters:
            data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`):
                input for Batch norm calculation
            observed_indicator (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
                Calculating the scale on the observed indicator.
        Returns:
            tuple of `torch.Tensor` of shapes
                (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`,
                `(batch_size, 1, num_input_channels)`)
        r  r&  r?   )rZ  r   r  	clamp_minrC   sqrtr  )r#   r)  r  denominatorlocvariancescales          r%   r+   zPatchTSMixerStdScaler.forward  s     ),,TXt|,LL!++C00((--dh-MMP[[Sj$661<AA$(TXT`Aaadoo
8d&8899s
e#S%//r&   r,   r-   r.   r/   r   r   rC   rD   r   r+   r1   r2   s   @r%   r~  r~    s         
`1 ` ` ` ` ` `0L06;l0	u|U\5<7	80 0 0 0 0 0 0 0r&   r~  c            	            e Zd ZdZdef fdZdej        dej        deej        ej        ej        f         fdZ	 xZ
S )PatchTSMixerMeanScalerz
    Computes a scaling factor as the weighted average absolute value along the first dimension, and scales the data
    accordingly.
    r5   c                 8   t                                                       t          |d          r|j        nd| _        t          |d          r|j        nd| _        t          |d          r|j        nd| _        t          |d          r|j        nd | _        d S )Nr  r   r  Tr  绽|=default_scale)r   r   r  r  r   r  r  r  r=   s     r%   r   zPatchTSMixerMeanScaler.__init__  s    )0)G)GN6%%Q)0)C)CMv~~5<V_5U5U`V11[`5<V_5U5U_V11[_r&   r)  r  rN   c                    ||z                                                       | j        d          }|                    | j        d          }|t          j        |d          z  }| j        W|                    d          }t          j        |                    d          d          }t          j        ||z            }n| j        t          j        |          z  }t          j        |dk    ||          }t          j        || j	                  }||z  }	| j
        s|                    | j                  }|	t          j        |          |fS )r  Tr  r   minNr   r   )absrZ  r   rC   clampr  squeeze	ones_likewherer  r  
zeros_like)
r#   r)  r  ts_sumnum_observedr  	batch_sumbatch_observationsr  scaled_datas
             r%   r+   zPatchTSMixerMeanScaler.forward  sE    ++002266tx6NN)--dh-EE\q9999 %

q
))I!&\-=-=a-@-@a!H!H!H!M)6H*HIIMM .1G1GGM L1,e]CC Et'9:::Ul| 	0MMdhM//EE,U33U::r&   r  r2   s   @r%   r  r    s         
`1 ` ` ` ` ` `&;L&;6;l&;	u|U\5<7	8&; &; &; &; &; &; &; &;r&   r  c            
            e Zd ZdZdef fdZ	 d	dej        deej                 de	ej        ej        ej        f         fdZ
 xZS )
PatchTSMixerNOPScalerz|
    Assigns a scaling factor equal to 1 along the first dimension, and therefore applies no scaling to the input data.
    r5   c                     t                                                       t          |d          r|j        nd| _        t          |d          r|j        nd| _        d S )Nr  r   r  T)r   r   r  r  r   r  r=   s     r%   r   zPatchTSMixerNOPScaler.__init__0  sW    )0)G)GN6%%Q)0)C)CMv~~r&   Nr)  r  rN   c                     t          j        |d                              | j        | j                  }t          j        |d                              | j        | j                  }|||fS )a  
        Parameters:
            data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`):
                input for Batch norm calculation
        Returns:
            tuple of `torch.Tensor` of shapes
                (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`,
                `(batch_size, 1, num_input_channels)`)
        FrQ   )r   r  )rC   r  r^   r   r  r  )r#   r)  r  r  r  s        r%   r+   zPatchTSMixerNOPScaler.forward5  sl     E:::??DHVZVb?cct5999>>48UYUa>bbS%r&   r(   )r,   r-   r.   r/   r   r   rC   rD   r   r   r+   r1   r2   s   @r%   r  r  +  s         N1 N N N N N N PT   L 6>u|6L 	u|U\5<7	8               r&   r  zS
    Base class for `PatchTSMixerEncoderOutput`, with potential hidden states.
    )custom_introc                   h    e Zd ZU dZdZeej                 ed<   dZ	ee
ej                          ed<   dS )PatchTSMixerEncoderOutputa-  
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches, d_model)`):
        Hidden-state at the output of the last layer of the model.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*):
        Hidden-states of the model at the output of each layer.
    Nlast_hidden_stater   )r,   r-   r.   r/   r  r   rC   FloatTensorr/  r   r   r3  r&   r%   r  r  F  sT           6:x 129998<M8E%"345<<<<<r&   r  c                        e Zd ZdZdef fdZe	 	 ddej        de	e
         de	e
         d	eeef         fd
            Z xZS )PatchTSMixerEncoderz
    Encoder for PatchTSMixer which inputs patched time-series and outputs patched embeddings.

    Args:
        config (`PatchTSMixerConfig`):
            Configuration.
    r5   c                 V   t                                          |           |j        | _        t          j        |j        |j                  | _        |j        rt          |          | _
        nd | _
        t          |          | _        |j        r|                                  d S d S )Nr   )r   r   use_return_dictr   r   r7  r:   patcherrH   rF   positional_encoderr   mlp_mixer_encoder	post_initr=   s     r%   r   zPatchTSMixerEncoder.__init__a  s       %5y!4fnEE) 	+&DF&S&S&SD##&*D#!2&!A!A!A  	NN	 	r&   FNr$  r   return_dictrN   c                 
   ||n| j         }|                     |          }| j        |                     |          }|                     ||          \  }}|st	          d ||fD                       S t          ||          S )a  
        past_values (`torch.FloatTensor` of shape `(batch_size, seq_length, num_input_channels)`):
            Context values of the time series. For a pretraining task, this denotes the input time series to
            predict the masked portion. For a forecasting task, this denotes the history/past time series values.
            Similarly, for classification or regression tasks, it denotes the appropriate context values of the
            time series.

            For univariate time series, `num_input_channels` dimension should be 1. For multivariate time series,
            it is greater than 1.

        Returns:
            `torch.FloatTensor` of shape `(batch_size, n_vars, num_patches, d_model)`
        N)r   c              3      K   | ]}|V  d S r(   r3  r   vs     r%   r  z.PatchTSMixerEncoder.forward.<locals>.<genexpr>  s6              r&   )r  r   )r  r  r  r  r   r  )r#   r$  r   r  patchesr  r   s          r%   r+   zPatchTSMixerEncoder.forwardq  s    * &1%<kk$BV ,,{++ ".--g66G+/+A+A'`t+A+u+u(= 	   &!      );L\ijjjjr&   )FN)r,   r-   r.   r/   r   r   r   rC   rD   r   r   r   r   r  r+   r1   r2   s   @r%   r  r  X  s         1         05&*	(k (k\(k 'tn(k d^	(k
 
u//	0(k (k (k ^(k (k (k (k (kr&   r  zG
    Base class for model's outputs, with potential hidden states.
    c                       e Zd ZU dZdZeej                 ed<   dZ	ee
ej                          ed<   dZeej                 ed<   dZeej                 ed<   dZeej                 ed<   dZeej                 ed<   dS )	PatchTSMixerModelOutputa  
    last_hidden_state (`torch.FloatTensor`  of shape `(batch_size, num_channels, num_patches, d_model)`):
        Hidden-state at the output of the last layer of the model.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*):
        Hidden-states of the model at the output of each layer.
    patch_input (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches, patch_length)`):
        Patched input data to the model.
    mask (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches)`, *optional*):
        Bool Tensor indicating True in masked patches and False otherwise.
    loc (`torch.FloatTensor` of shape `(batch_size, 1, num_channels)`, *optional*):
        Gives the mean of the context window per channel. Used for revin denorm outside the model, if revin
        enabled.
    scale (`torch.FloatTensor` of shape `(batch_size, 1, num_channels)`, *optional*):
        Gives the std dev of the context window per channel. Used for revin denorm outside the model, if revin
        enabled.
    Nr  r   rc   rN  r  r  )r,   r-   r.   r/   r  r   rC   r  r/  r   r   rc   rN  r  r  r3  r&   r%   r  r    s          " 6:x 129998<M8E%"345<<</3K%+,333(,D(5$
%,,,'+C%#	$+++)-E8E%&-----r&   r  z=
    The PatchTSMixer Model for time-series forecasting.
    c                        e Zd Zddedef fdZe	 	 	 ddej        de	ej                 de	e         d	e	e         d
e
f
d            Z xZS )PatchTSMixerModelFr5   
mask_inputc                    t                                          |           |j        | _        t          |          | _        t          |          | _        |du rt          |          | _        nd| _        |j	        dk    rt          |          | _        n=|j	        dk    s	|j	        du rt          |          | _        nt          |          | _        |j        r|                                  dS dS )z
        mask_input (bool, *optional*, defaults to `False`):
            Whether to mask the input using the [`PatchTSMixerMasking`] module.
        TNr^   r_   )r   r   r  r  encoderrj  patchingrv  maskingr   r  scalerr~  r  r  )r#   r5   r  r$   s      r%   r   zPatchTSMixerModel.__init__  s    
 	   %5*622,V44.v66DLLDL>V##088DKK^u$$$(>(>/77DKK/77DK  	NN	 	r&   Nr$  observed_maskr   r  rN   c           	         ||n| j         }d}|t          j        |          }|                     ||          \  }}}|                     |          }	|	}
| j        |                     |	          \  }
}|                     |
||          }t          |t                    r	t          | }|s)t          d |j
        |j        |	|||fD                       S t          |j
        |j        |	|||          S )a  
        past_values (`torch.FloatTensor` of shape `(batch_size, seq_length, num_input_channels)`):
            Context values of the time series. For a pretraining task, this denotes the input time series to predict
            the masked portion. For a forecasting task, this denotes the history/past time series values. Similarly,
            for classification or regression tasks, it denotes the appropriate context values of the time series.

            For univariate time series, `num_input_channels` dimension should be 1. For multivariate time series, it is
            greater than 1.
        observed_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
            Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
            in `[0, 1]`:
            - 1 for values that are **observed**,
            - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
        Nr   r  c              3      K   | ]}|V  d S r(   r3  r  s     r%   r  z,PatchTSMixerModel.forward.<locals>.<genexpr>  6       
 
 
 
 
 
 
 
r&   )r  r   rc   rN  r  r  )r  rC   r  r  r  r  r  r  r   r  r  r   r  )r#   r$  r  r   r  rN  scaled_past_valuesr  r  	patched_x	enc_inputencoder_outputs               r%   r+   zPatchTSMixerModel.forward  sC   , &1%<kk$BV !OK88M)-[-)P)P&CMM"455		<#"ll955OIt !5# & 
 
 ne,, 	H6GN 	 
 
 #4"0
 
 
 
 
 
 ',>(6!
 
 
 	
r&   r   )NFN)r,   r-   r.   r   r   r   r   rC   rD   r   r  r+   r1   r2   s   @r%   r  r    s         1 t      6  15/4&*A
 A
\A
  -A
 'tn	A

 d^A
 
!A
 A
 A
 ^A
 A
 A
 A
 A
r&   r  z>
    Output type of [`PatchTSMixerForPreTrainingOutput`].
    c                       e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
eej                 ed<   dZeeej                          ed<   dS ) PatchTSMixerForPreTrainingOutputa@  
    loss (*optional*, returned when `y` is provided, `torch.FloatTensor` of shape `()`):
        Total loss
    prediction_outputs (`torch.FloatTensor` of shape `(batch_size, num_input_channels, num_patches, patch_length)`):
        Prediction output from the pretrain head.
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_input_channels, num_patches, d_model)`):
        Backbone embeddings before passing through the head.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*):
        Hidden-states of the model at the output of each layer.
    Nlossprediction_outputsr  r   r,   r-   r.   r/   r  r   rC   r  r/  r  r  r   r   r3  r&   r%   r  r  #           	 	 )-D(5$
%,,,6:!23:::59x 129998<M8E%"345<<<<<r&   r  z.
    `PatchTSMixer` for mask pretraining.
    c                        e Zd Zdef fdZe	 	 	 	 ddej        deej                 dee	         d	e	d
ee	         de
fd            Z xZS )PatchTSMixerForPretrainingr5   c                 
   t                                          |           t          |d          | _        t	          |          | _        |j        | _        |j        | _        |j        r|                                  d S d S )NT)r  r   )	r   r   r  r#  r5  headmasked_lossr  r  r=   s     r%   r   z#PatchTSMixerForPretraining.__init__A  s       &v$???
,F;;;	!-%5  	NN	 	r&   NFTr$  r  r   return_lossr  rN   c                    ||n| j         }| j        du r!t          j                            d          }n t          j                            d          }|                     ||||          }t          |t                    r	t          | }| 	                    |j
                  }|du r |||j                  }	nd}	| j        du rO|	M|	                    d          |j        z                                  |j                                        d	z   z  }	|s't          d
 |	||j
        |j        fD                       S t!          |	||j
        |j                  S )aT  
        past_values (`torch.FloatTensor` of shape `(batch_size, seq_length, num_input_channels)`):
            Context values of the time series. For a pretraining task, this denotes the input time series to predict
            the masked portion. For a forecasting task, this denotes the history/past time series values. Similarly,
            for classification or regression tasks, it denotes the appropriate context values of the time series.

            For univariate time series, `num_input_channels` dimension should be 1. For multivariate time series, it is
            greater than 1.
        observed_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
            Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
            in `[0, 1]`:
            - 1 for values that are **observed**,
            - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
        return_loss (`bool`,  *optional*):
            Whether to return the loss in the `forward` call.
        NTnone	reductionr^   r  r   r  r   r   r  c              3      K   | ]}|V  d S r(   r3  r  s     r%   r  z5PatchTSMixerForPretraining.forward.<locals>.<genexpr>  6              r&   r  r  r  r   )r  r  rC   r   MSELossr#  r  r   r  r  r  rc   r^   rN  rZ  r   r  )
r#   r$  r  r   r  r  r  model_outputx_hatloss_vals
             r%   r+   z"PatchTSMixerForPretraining.forwardL  s   2 &1%<kk$BVt##8##f#55DD8##f#55D zz'!5#	 " 
 
 lE** 	B2LAL		,899$tE<#;<<HHH t##(< "--0AAFFHHLL]LaLaLcLcfkLklH 		    2 .	      0$*<&4	
 
 
 	
r&   NFTN)r,   r-   r.   r   r   r   rC   rD   r   r   r  r+   r1   r2   s   @r%   r  r  ;  s        	1 	 	 	 	 	 	  15/4 &*D
 D
\D
  -D
 'tn	D

 D
 d^D
 
*D
 D
 D
 ^D
 D
 D
 D
 D
r&   r  z=
    Output type of [`PatchTSMixerForPredictionOutput`].
    c                       e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
eej                 ed<   dZeeej                          ed<   dZeej                 ed<   dZeej                 ed<   dS )	PatchTSMixerForPredictionOutputaD  
    loss (*optional*, returned when `y` is provided, `torch.FloatTensor` of shape `()`):
        Total loss.
    prediction_outputs (`torch.FloatTensor` of shape `(batch_size, prediction_length, num_input_channels)`):
        Prediction output from the forecast head.
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_input_channels, num_patches, d_model)`):
        Backbone embeddings before passing through the head.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*):
        Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    loc (`torch.FloatTensor`, *optional* of shape `(batch_size, 1, num_input_channels)`):
        Input mean
    scale (`torch.FloatTensor`, *optional* of shape `(batch_size, 1, num_input_channels)`):
        Input std dev
    Nr  r  r  r   r  r  )r,   r-   r.   r/   r  r   rC   r  r/  r  r  r   r   r  r  r3  r&   r%   r  r    s           )-D(5$
%,,,6:!23:::59x 129998<M8E%"345<<<'+C%#	$+++)-E8E%&-----r&   r  z
    Base class for time series model's predictions outputs that contains the sampled values from the chosen
    distribution.
    c                   8    e Zd ZU dZdZeej                 ed<   dS )"SamplePatchTSMixerPredictionOutput
    sequences (`torch.FloatTensor` of shape `(batch_size, num_samples, prediction_length, number_channels)`):
        Sampled values from the chosen distribution.
    N	sequences	r,   r-   r.   r/   r  r   rC   r  r/  r3  r&   r%   r  r    6          
 .2Ix)*11111r&   r  c                   8    e Zd ZU dZdZeej                 ed<   dS )"SamplePatchTSMixerRegressionOutputr  Nr  r  r3  r&   r%   r  r    r  r&   r  inputtargetrN   c                 .    |                      |           S )zc
    Computes the negative log likelihood loss from input distribution with respect to target.
    )log_prob)r  r  s     r%   nllr    s     NN6""""r&   input_tensorweightsc                 n   |t          j        |dk    | |z  t          j        |                     }t          j        |r|                    |          n|                                d          }|r|                    |          n|                                |z  S |                     |          S )aj  
    Computes the weighted average of a given tensor across a given `dim`, masking values associated with weight zero,
    meaning instead of `nan * 0 = nan` you will get `0 * 0 = 0`.

    Args:
        input_tensor (`torch.FloatTensor`):
            Input tensor, of which the average must be computed.
        weights (`torch.FloatTensor`, *optional*):
            Weights tensor, of the same shape as `input_tensor`.
        dim (`int`, *optional*):
            The dim along which to average `input_tensor`.

    Returns:
        `torch.FloatTensor`: The tensor with values averaged along the specified `dim`.
    Nr   r   r&  r  )rC   r  r  r  rZ  r^   )r  r  r   weighted_tensorsum_weightss        r%   weighted_averager    s      +glL74JEL\]iLjLjkkk#"P'++#+"6"6"67;;==VYZZZ03N###,,,9L9L9N9NR]]]  S )))r&   c                       e Zd ZdZdef fdZe	 	 	 	 	 ddej        de	ej                 d	e	ej                 d
e	e
         de
de	e
         defd            Z ej                    	 ddej        de	ej                 defd            Z xZS )PatchTSMixerForPredictionz
    `PatchTSMixer` for forecasting application.

    Args:
        config (`PatchTSMixerConfig`):
            Configuration.

    Returns:
        `None`.
    r5   c                 ,   t                                          |           |j        | _        |j        | _        |j        | _        |j        | _        |j        dk    rd | _        nb|j        }t          t          t          d}|                    |j        d           }| ||          | _        nt          d|j                   t          |          | _        t          || j                  | _        |j        r|                                  d S d S )Nmse	student_tnormalnegative_binomialr   Unknown distribution output r5   r  )r   r   r  r  r   num_parallel_samplesr  r  r   r   r   getr`   r  r#  r   r  r  )r#   r5   r   distribution_output_mapoutput_classr$   s        r%   r   z"PatchTSMixerForPrediction.__init__  s)      K	%5*0*K'$*$?!;%'+D$$*C+&%;' '#
 366v7QSWXXL'+7<C+@+@+@(( !\@Z!\!\]]]&v..
1 $ 8
 
 
	  	NN	 	r&   NFTr$  r  future_valuesr   r  r  rN   c           	         | j         dk    rt          j        d          }n"| j         dk    rt          }nt	          d          ||n| j        }|                     ||||          }t          |t                    r	t          | }| 
                    |j                  }	d}
| j        | j        rp| j                            |	|j        d| j        f         |j        d| j        f         	          }|,|d
u r( |||d| j        f                   }
t#          |
          }
n|	|j        d| j        f         z  |j        d| j        f         z   }	||d
u r ||	|d| j        f                   }
nt| j        rI| j                            |	|j        |j        	          }||d
u r |||          }
t#          |
          }
n$|	|j        z  |j        z   }	||d
u r ||	|          }
| j        )|j        d| j        f         }|j        d| j        f         }n|j        }|j        }|s)t          d |
|	|j        |j        ||fD                       S t'          |
|	|j        |j        ||          S )a  
        past_values (`torch.FloatTensor` of shape `(batch_size, seq_length, num_input_channels)`):
            Context values of the time series. For a pretraining task, this denotes the input time series to predict
            the masked portion. For a forecasting task, this denotes the history/past time series values. Similarly,
            for classification or regression tasks, it denotes the appropriate context values of the time series.

            For univariate time series, `num_input_channels` dimension should be 1. For multivariate time series, it is
            greater than 1.
        observed_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
            Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
            in `[0, 1]`:
            - 1 for values that are **observed**,
            - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
        future_values (`torch.FloatTensor` of shape `(batch_size, target_len, num_input_channels)` for forecasting,:
            `(batch_size, num_targets)` for regression, or `(batch_size,)` for classification, *optional*):
            Target values of the time series, that serve as labels for the model. The `future_values` is what the
            Transformer needs during training to learn to output, given the `past_values`. Note that, this is NOT
            required for a pretraining task.

            For a forecasting task, the shape is be `(batch_size, target_len, num_input_channels)`. Even if we want
            to forecast only specific channels by setting the indices in `prediction_channel_indices` parameter,
            pass the target data with all channels, as channel Filtering for both prediction and target will be
            manually applied before the loss computation.
        return_loss (`bool`,  *optional*):
            Whether to return the loss in the `forward` call.
        r  r^   r  r  2Invalid loss function: Allowed values: mse and nllNr  .r  r  Tc              3      K   | ]}|V  d S r(   r3  r  s     r%   r  z4PatchTSMixerForPrediction.forward.<locals>.<genexpr>  r  r&   )r  r  r  r   r  r  )r  r   r  r  r`   r  r#  r  r   r  r  r  r   r  distributionr  r  r  r   r  )r#   r$  r  r  r   r  r  r  r  y_hatr  r  r  r  s                 r%   r+   z!PatchTSMixerForPrediction.forward  s*   H 9:///DDY%DDQRRR%0%<kk$BV zz'!5#	 " 
 
 lE** 	B2LAL 		,899*6' `#7DD$(d.M)MN&,S$2Q-QR  E    
 !,1D1D#t$%c4+J&JK   H
  099H L.sD4S/STT"&sD,K'KLM  !,1D1D#tE=d>]9]+^__H' 
:#7DD|/|7I  E     !,1D1D#tL-@@H/99H 22\5EE ,1D1D#tE=99H*6"3(G#GHC &sD,K'KLEE"C &E 	 
 
  2 .
 
 
 
 
 
 /$*<&4
 
 
 	
r&   c                 
   | j         } | |d|d          }| j                            |j        |j        |j                  fdt          |          D             }t          j        |d          }t          |          S )	a  
        Generate sequences of sample predictions from a model with a probability distribution head.

        Args:
            past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
                Past values of the time series that serves as context in order to predict the future.

            observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
                Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
                in `[0, 1]`:

                - 1 for values that are **observed**,
                - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).

        Return:
            [`SamplePatchTSMixerPredictionOutput`] where the outputs `sequences` tensor will have shape `(batch_size,
            number of samples, prediction_length, num_input_channels)`.
        NF)r$  r  r  r   r  c                 8    g | ]}                                 S r3  sampler   r   r  s     r%   r   z6PatchTSMixerForPrediction.generate.<locals>.<listcomp>  s%    NNNQ<&&((NNNr&   r   r   r  )
r  r  r  r  r  r  r   rC   stackr  )r#   r$  r  r  outputssamplesr  s         @r%   generatez"PatchTSMixerForPrediction.generate  s    2  $8 $#'!&	
 
 
 /<<&GKw} = 
 

 ONNN%8L2M2MNNN +g1---1GDDDDr&   )NNFTNr(   )r,   r-   r.   r/   r   r   r   rC   rD   r   r   r  r+   no_gradr  r  r1   r2   s   @r%   r  r    sR       	 	1      @  1504/4 &*w
 w
\w
  -w
  -	w

 'tnw
 w
 d^w
 
)w
 w
 w
 ^w
r U]__ 15-E -E\-E  --E 
,	-E -E -E _-E -E -E -E -Er&   r  zK
    Output type of [`PatchTSMixerForTimeSeriesClassificationOutput`].
    c                       e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
eej                 ed<   dZeeej                          ed<   dS )-PatchTSMixerForTimeSeriesClassificationOutputaP  
    loss (*optional*, returned when `y` is provided, `torch.FloatTensor` of shape `()`):
        Total loss.
    prediction_outputs (`torch.FloatTensor` of shape `(batch_size, num_labels)`):
        Prediction output from the classification head.
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_input_channels, num_patches, d_model)`):
        Backbone embeddings before passing through the head.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*):
        Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    Nr  r  r  r   r  r3  r&   r%   r  r    r  r&   r  c                        e Zd ZdZdef fdZe	 	 	 	 ddej        de	ej                 d	e	e
         d
e
de	e
         defd            Z xZS )'PatchTSMixerForTimeSeriesClassificationz
    `PatchTSMixer` for classification application.

    Args:
        config (`PatchTSMixerConfig`):
            Configuration.

    Returns:
        `None`.
    r5   c                 P   t                                          |           t          |          | _        t	          |          | _        |j        | _        |j        dv r!t          |j	        |j
                  | _        nd | _        |j        r|                                  d S d S )Nr   r_   r^   Tr:   rM   )r   r   r  r#  r  r  r  r   InjectScalerStatistics4Dr:   rM   inject_scaler  r=   s     r%   r   z0PatchTSMixerForTimeSeriesClassification.__init__  s       &v..
*
 
 
	  &5>222 8]c]o p p pD $D  	NN	 	r&   NFTr$  target_valuesr   r  r  rN   c                    t           j                                        }||n| j        }|                     |||          }t          |t                    r	t          | }| j        ,|                     |j	        |j
        |j                  |_	        |                     |j	                  }||du r |||          }	nd}	|s't          d |	||j	        |j        fD                       S t          |	||j	        |j                  S )a  
        past_values (`torch.FloatTensor` of shape `(batch_size, seq_length, num_input_channels)`):
            Context values of the time series. For a pretraining task, this denotes the input time series to predict
            the masked portion. For a forecasting task, this denotes the history/past time series values. Similarly,
            for classification or regression tasks, it denotes the appropriate context values of the time series.

            For univariate time series, `num_input_channels` dimension should be 1. For multivariate time series, it is
            greater than 1.
        target_values (`torch.FloatTensor` of shape `(batch_size, target_len, num_input_channels)` for forecasting,
            `(batch_size, num_targets)` for regression, or `(batch_size,)` for classification, *optional*):
            Target
            values of the time series, that serve as labels for the model. The `target_values` is what the
            Transformer needs during training to learn to output, given the `past_values`. Note that, this is NOT
            required for a pretraining task.

            For a forecasting task, the shape is be `(batch_size, target_len, num_input_channels)`. Even if we want
            to forecast only specific channels by setting the indices in `prediction_channel_indices` parameter,
            pass the target data with all channels, as channel Filtering for both prediction and target will be
            manually applied before the loss computation.

            For a classification task, it has a shape of `(batch_size,)`.

            For a regression task, it has a shape of `(batch_size, num_targets)`.
        return_loss (`bool`, *optional*):
            Whether to return the loss in the `forward` call.
        Nr  r  Tc              3      K   | ]}|V  d S r(   r3  r  s     r%   r  zBPatchTSMixerForTimeSeriesClassification.forward.<locals>.<genexpr>>  r  r&   r  )rC   r   CrossEntropyLossr  r#  r  r   r  r"  r  r  r  r  r   r  )
r#   r$  r#  r   r  r  r  r  r  r  s
             r%   r+   z/PatchTSMixerForTimeSeriesClassification.forward  sV   H x((**%0%<kk$BVzz!5# " 
 

 lE** 	B2LAL(-1->->. $"( .? . .L* 		,899$)<)<tE=11HHH 		    2 .	      =$*<&4	
 
 
 	
r&   r  )r,   r-   r.   r/   r   r   r   rC   rD   r   r   r  r+   r1   r2   s   @r%   r  r    s        	 	1      "  15/4 &*M
 M
\M
  -M
 'tn	M

 M
 d^M
 
7M
 M
 M
 ^M
 M
 M
 M
 M
r&   r  z=
    Output type of [`PatchTSMixerForRegressionOutput`].
    c                       e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
eej                 ed<   dZeeej                          ed<   dS )PatchTSMixerForRegressionOutputaM  
    loss (*optional*, returned when `y` is provided, `torch.FloatTensor` of shape `()`):
        Total loss.
    regression_outputs (`torch.FloatTensor` of shape `(batch_size, num_targets)`):
        Prediction output from the regression head.
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_input_channels, num_patches, d_model)`):
        Backbone embeddings before passing through the head.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*):
        Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    Nr  regression_outputsr  r   )r,   r-   r.   r/   r  r   rC   r  r/  r)  r  r   r   r3  r&   r%   r(  r(  P  r  r&   r(  c                   `     e Zd Zd
dededef fdZdej        dej        dej        fd	Z xZS )r!  r?   r:   rM   	expansionc                 D   t                                                       t          j        |dz   ||z            | _        t          j        ||z  |          | _        t          j        dd|z            | _        t          j        d|z  d          | _        || _        d S rW  )	r   r   r   r   inverse_trans_expansioninverse_trans_compressionmap_scale_expansionmap_scale_compressionrM   )r#   r:   rM   r+  r$   s       r%   r   z!InjectScalerStatistics4D.__init__i  s    ')y1i'>Q'R'R$)+9w3F)P)P&#%9QI#>#> %'Yq9}a%@%@"&r&   r)   r  r  c                 .   |                     dd          }|                    d          }|                    dd| j        d          }|                     dd          }|                    d          }|                    dd| j        d          }t	          j        ||gd          }|                     |          }|                     |          }t	          j        ||gd          }|                     |          }| 	                    |          }|S )a  
        Args:
            inputs (`torch.Tensor` of shape `(batch_size, num_input_channels, num_patch, d_model)`)
            loc (`torch.Tensor` of shape `(batch_size, 1, num_input_channels)`)
            scale (`torch.Tensor` of shape `(batch_size, 1, num_input_channels)`)
        Returns:
            `torch.Tensor` of shape `(batch_size, num_input_channels, num_patch, d_model)`
        r   r   r   r   )
r@   rX   rD  rM   rC   catr/  r0  r-  r.  )r#   r)   r  r  r^   stdevconcat_statss          r%   r+   z InjectScalerStatistics4D.forwardr  s    }}R$$~~b!!{{1a!1155B''##Q4#3Q77y$B777//==11,??FL1r:::--f55//77r&   )r?   )	r,   r-   r.   r0   r   rC   rD   r+   r1   r2   s   @r%   r!  r!  h  s        ' ' '# '# ' ' ' ' ' 'el  el        r&   r!  z4
    `PatchTSMixer` for regression application.
    c                        e Zd Zdef fdZe	 	 	 	 ddej        deej                 dee	         d	e	d
ee	         de
fd            Z ej                    dej        defd            Z xZS )PatchTSMixerForRegressionr5   c                    t                                          |           t          |          | _        |j        | _        |j        | _        |j        | _        |j        | _        |j        dk    rd | _        n_t          t          t          d}|                    |j                  }| ||j                  | _        nt          d|j                   |j        dv r!t          |j        |j                  | _        nd | _        t'          || j                  | _        |j        r|                                  d S d S )Nr  r  r   r  r  r   r  )r   r   r  r#  r  r  r  r  r   r   r   r  r  r`   r   r!  r:   rM   r"  r  r  r  )r#   r5   r  r  r$   s       r%   r   z"PatchTSMixerForRegression.__init__  sW      &v..
K	#)#= %5$*$?!;%'+D$$ ,&%;' '#
 366v7QRRL'+7<F<N+O+O+O(( !\@Z!\!\]]]>222 8]c]o p p pD $D* $ 8
 
 
	  	NN	 	r&   NFTr$  r#  r   r  r  rN   c                 p     j         dk    rt          j        d          }n" j         dk    rt          }nt	          d          ||n j        }                     |||          }t          |t                    r	t          | } j
        , 
                    |j        |j        |j                  |_                             |j                  }||d	u r j        r j        d
k    r't!          j        |dk               rt%          d           j                            |          }	t           fd|D                       } ||	|          }
t)          |
          }
n |||          }
nd}
|s't          d |
||j        |j        fD                       S t-          |
||j        |j                  S )a  
        past_values (`torch.FloatTensor` of shape `(batch_size, seq_length, num_input_channels)`):
            Context values of the time series. For a pretraining task, this denotes the input time series to predict
            the masked portion. For a forecasting task, this denotes the history/past time series values. Similarly,
            for classification or regression tasks, it denotes the appropriate context values of the time series.

            For univariate time series, `num_input_channels` dimension should be 1. For multivariate time series, it is
            greater than 1.
        target_values (`torch.FloatTensor` of shape `(batch_size, target_len, num_input_channels)` for forecasting,
            `(batch_size, num_targets)` for regression, or `(batch_size,)` for classification, *optional*):
            Target values of the time series, that serve as labels for the model. The `target_values` is what the
            Transformer needs during training to learn to output, given the `past_values`. Note that, this is NOT
            required for a pretraining task.

            For a forecasting task, the shape is be `(batch_size, target_len, num_input_channels)`. Even if we want
            to forecast only specific channels by setting the indices in `prediction_channel_indices` parameter,
            pass the target data with all channels, as channel Filtering for both prediction and target will be
            manually applied before the loss computation.

            For a classification task, it has a shape of `(batch_size,)`.

            For a regression task, it has a shape of `(batch_size, num_targets)`.
        return_loss (`bool`, *optional*):
            Whether to return the loss in the `forward` call.
        r  r^   r  r  r
  Nr  r  Tr  r   zDtarget_values cannot be negative for negative_binomial distribution.c              3   X   K   | ]$}|                     d j        j                  V  %dS )r   N)r   r5   r  )r   itemr#   s     r%   r  z4PatchTSMixerForRegression.forward.<locals>.<genexpr>  s6      WWdiiDK,CDDWWWWWWr&   c              3      K   | ]}|V  d S r(   r3  r  s     r%   r  z4PatchTSMixerForRegression.forward.<locals>.<genexpr>	  r  r&   )r  r)  r  r   )r  r   r  r  r`   r  r#  r  r   r  r"  r  r  r  r  r  rC   any	Exceptionr  r  r   r(  )r#   r$  r#  r   r  r  r  r  r  r  r  s   `          r%   r+   z!PatchTSMixerForRegression.forward  s    F 9:///DDY%DDQRRR%0%<kk$BVzz!5# " 
 

 lE** 	B2LAL(-1->->. $"( .? . .L* 		,899$)<)<' 
6+/BBBuyQ^abQbGcGcB#$jkkk#7DDUKKWWWWQVWWWWW4m<<+H554}55H 		    2 .	      /$*<&4	
 
 
 	
r&   c                 ,   | j         } | |dd          }| j                            |j                  fdt	          |          D             }t          j        |d                              d|| j        j	                  }t          |          S )	a
  
        Generate sequences of sample predictions from a model with a probability distribution head.

        Args:
            past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
                Past values of the time series that serves as context in order to predict the target values.

        Return:
            [`SamplePatchTSMixerRegressionOutput`] where the outputs `sequences` tensor will have shape `(batch_size,
            number of samples, num_targets)`.
        NF)r$  r#  r   c                 8    g | ]}                                 S r3  r  r  s     r%   r   z6PatchTSMixerForRegression.generate.<locals>.<listcomp>8  s2     
 
 
&'L!!
 
 
r&   r   r   r   r  )r  r  r  r)  r   rC   r  r   r5   r  r  )r#   r$  r  r  r  r  s        @r%   r  z"PatchTSMixerForRegression.generate  s    "  $8 $#!&
 
 
 /<<W=WXX
 
 
 
+01E+F+F
 
 

 +g1---2227KT[Mdee1GDDDDr&   r  )r,   r-   r.   r   r   r   rC   rD   r   r   r(  r+   r  r  r  r1   r2   s   @r%   r6  r6    s       %1 % % % % % %N  15/4 &*Z
 Z
\Z
  -Z
 'tn	Z

 Z
 d^Z
 
)Z
 Z
 Z
 ^Z
x U]__#E\#E 
,#E #E #E _#E #E #E #E #Er&   r6  )r"  r  r  r  r  r6  )Nr   N)NFr   )Nr   )NN)Rr/   rZ   dataclassesr   typingr   r   r   rC   torch.nnr   transformers.modeling_utilsr   transformers.utilsr   modeling_flash_attention_utilsr
   modeling_utilsr   processing_utilsr   time_series_utilsr   r   r   utilsr   r   configuration_patchtsmixerr   
get_loggerr,   loggerModuler   r4   rF   rh   rt   r   rD   r   r   r   r   r   r   r   r   r  r"  r5  listr   r0   rR  rh  rj  rv  r~  r  r  r  r  r  r  r  r  r  r  r  distributionsDistributionr  r  r  r  r  r(  r!  r6  __all__r3  r&   r%   <module>rR     s0   " !  ! ! ! ! ! ! , , , , , , , , , ,        7 7 7 7 7 7 * * * * * * B B B B B B 5 5 5 5 5 5 & & & & & & U U U U U U U U U U , , , , , , , , : : : : : : 
	H	%	%       *& & & & &BI & & &,$ $ $ $ $RY $ $ $N. . . . .BI . . .b    bi   .- - - - -29 - - -n  $(,% %I%<% 
% <	%
 U\*% e_% % %% % % %>U/ U/ U/ U/ U/BI U/ U/ U/pC C C C Cbi C C CL* * * * *	 * * *Z# # # # #	 # # #L&# &# &# &# &#	 &# &# &#R5 5 5 5 5BI 5 5 5pD D D D DRY D D DN ) ) ) ) )/ ) ) )2    ry   D 04',7% 7%L7%7% 'tn7% !%	7%
 7% 7% 7% 7%| 04	A% A%LA%$T3Y/A% 'tnA% 	A% A% A% A%J- - - - -29 - - -b9" 9" 9" 9" 9"") 9" 9" 9"z 0  0  0  0  0BI  0  0  0H3; 3; 3; 3; 3;RY 3; 3; 3;n         BI      6   
	= 	= 	= 	= 	= 	= 	=  	=Bk Bk Bk Bk Bk5 Bk Bk BkJ   
. . . . .k . .  .4   
^
 ^
 ^
 ^
 ^
3 ^
 ^
 
^
B   
= = = = ={ = =  =$   
Q
 Q
 Q
 Q
 Q
!< Q
 Q
 
Q
h   
. . . . .k . .  .0   2 2 2 2 2 2 2  2   2 2 2 2 2 2 2  2#u"/ # #%, # # # #* *5< *(5<:P *fkfr * * * *0TE TE TE TE TE ; TE TE TEn   
= = = = =K = =  =$k
 k
 k
 k
 k
.I k
 k
 k
\   
= = = = =k = =  =$% % % % %ry % % %P   
iE iE iE iE iE ; iE iE 
iEX  r&   