
     `i                        d Z ddlZddlZddlmZmZ ddlZddlmZ ddlm	Z	 ddl
mZ ddlmZ dd	lmZmZmZ dd
lmZ ddlmZ ddlmZ ddlmZmZmZ ddlmZ ddlmZm Z  ddl!m"Z"m#Z#m$Z$m%Z%m&Z& ddl'm(Z( ddl)m*Z*  e#            rddl+m,Z, ddl-m.Z.  e&j/        e0          Z1dZ2	 ddl3m4Z4 dZ2e15                    d           n&# e6$ r Y ne7$ r e18                    d           Y nw xY w G d dej9                  Z:e2se4Z: G d dej9                  Z; G d d ej9                  Z< G d! d"ej9                  Z= G d# d$ej9                  Z> G d% d&ej9                  Z? G d' d(ej9                  Z@ G d) d*e          ZAe" G d+ d,e                      ZB G d- d.eB          ZC G d/ d0ej9                  ZD e"d12           G d3 d4eBe                      ZEd4d,gZFdS )5zPyTorch Pop2Piano model.    N)OptionalUnion)nn)CrossEntropyLoss)GenerationConfig   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)AttentionMaskConverter)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutput)PreTrainedModel) find_pruneable_heads_and_indicesprune_linear_layer)auto_docstringis_torch_flex_attn_availableis_torch_fx_proxyis_torchdynamo_compilinglogging)deprecate_kwarg   )Pop2PianoConfig)	BlockMask)make_flex_block_causal_maskT)FusedRMSNormFzVDiscovered apex.normalization.FusedRMSNorm - will use it instead of Pop2PianoLayerNormzIDiscovered apex but it failed to load, falling back to Pop2PianoLayerNormc                   &     e Zd Zd fd	Zd Z xZS )Pop2PianoLayerNormư>c                     t                                                       t          j        t	          j        |                    | _        || _        dS )zj
        Construct a layernorm module in the Pop2Piano style. No bias and no subtraction of mean.
        N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizeeps	__class__s      /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/pop2piano/modeling_pop2piano.pyr&   zPop2PianoLayerNorm.__init__B   sD     	l5:k#:#:;; #    c                 h   |                     t          j                                      d                              dd          }|t          j        || j        z             z  }| j        j        t          j	        t          j
        fv r|                     | j        j                  }| j        |z  S )N   T)keepdim)tor(   float32powmeanrsqrtr+   r*   dtypefloat16bfloat16)r,   hidden_statesvariances      r0   forwardzPop2PianoLayerNorm.forwardJ   s     !##EM2266q99>>r4>PP%Ht?T4T(U(UU ; ???),,T[->??M{]**r1   )r#   )__name__
__module____qualname__r&   r@   __classcell__r/   s   @r0   r"   r"   A   sL        $ $ $ $ $ $+ + + + + + +r1   r"   c                   *     e Zd Zdef fdZd Z xZS )Pop2PianoDenseActDenseconfigc                 J   t                                                       t          j        |j        |j        d          | _        t          j        |j        |j        d          | _        t          j        |j	                  | _
        t          |j                 | _        d S NFbias)r%   r&   r   Lineard_modeld_ffwiwoDropoutdropout_ratedropoutr	   dense_act_fnactr,   rH   r/   s     r0   r&   zPop2PianoDenseActDense.__init__`   sx    )FNFKeDDD)FKeDDDz&"566&-.r1   c                    |                      |          }|                     |          }|                     |          }t          | j        j        t          j                  r]|j        | j        j        j        k    rC| j        j        j        t          j	        k    r$|
                    | j        j        j                  }|                     |          }|S N)rP   rV   rT   
isinstancerQ   r*   r(   Tensorr;   int8r6   )r,   r>   s     r0   r@   zPop2PianoDenseActDense.forwardg   s    ..//]33tw~u|44	C#tw~';;;$
22),,TW^-ABBM..r1   rA   rB   rC   r   r&   r@   rD   rE   s   @r0   rG   rG   _   sS        / / / / / / /      r1   rG   c                   *     e Zd Zdef fdZd Z xZS )Pop2PianoDenseGatedActDenserH   c                    t                                                       t          j        |j        |j        d          | _        t          j        |j        |j        d          | _        t          j        |j        |j        d          | _        t          j	        |j
                  | _        t          |j                 | _        d S rJ   )r%   r&   r   rM   rN   rO   wi_0wi_1rQ   rR   rS   rT   r	   rU   rV   rW   s     r0   r&   z$Pop2PianoDenseGatedActDense.__init__w   s    IfnfkFFF	IfnfkFFF	)FKeDDDz&"566&-.r1   c                    |                      |                     |                    }|                     |          }||z  }|                     |          }t	          | j        j        t          j                  r]|j	        | j        j        j	        k    rC| j        j        j	        t          j
        k    r$|                    | j        j        j	                  }|                     |          }|S rY   )rV   ra   rb   rT   rZ   rQ   r*   r(   r[   r;   r\   r6   )r,   r>   hidden_geluhidden_linears       r0   r@   z#Pop2PianoDenseGatedActDense.forward   s    hhtyy7788		-00#m3]33 tw~u|44	C#tw~';;;$
22),,TW^-ABBM..r1   r]   rE   s   @r0   r_   r_   v   sS        / / / / / / /      r1   r_   c                   *     e Zd Zdef fdZd Z xZS )Pop2PianoLayerFFrH   c                 $   t                                                       |j        rt          |          | _        nt          |          | _        t          |j        |j                  | _	        t          j        |j                  | _        d S )Nr.   )r%   r&   is_gated_actr_   DenseReluDenserG   r"   rN   layer_norm_epsilon
layer_normr   rR   rS   rT   rW   s     r0   r&   zPop2PianoLayerFF.__init__   sy     	A"=f"E"ED"8"@"@D,V^AZ[[[z&"566r1   c                     |                      |          }|                     |          }||                     |          z   }|S rY   )rm   rk   rT   )r,   r>   forwarded_statess      r0   r@   zPop2PianoLayerFF.forward   sF    ??=99../?@@%5E(F(FFr1   r]   rE   s   @r0   rg   rg      sS        7 7 7 7 7 7 7      r1   rg   c                        e Zd Z	 	 ddedee         f fdZd Zedd
            Z	ddZ
 eddd          	 	 	 	 	 	 	 	 	 dd            Z xZS )Pop2PianoAttentionFNrH   	layer_idxc                 P   t                                                       |j        | _        || _        |j        | _        |j        | _        |j        | _        |j        | _        |j	        | _
        |j        | _        | j
        | j        z  | _        || _        |/| j        r(t                              d| j        j         d           t'          j        | j        | j        d          | _        t'          j        | j        | j        d          | _        t'          j        | j        | j        d          | _        t'          j        | j        | j        d          | _        | j        r$t'          j        | j        | j
                  | _        t7                      | _        d| _        d S )NzInstantiating a decoder z without passing `layer_idx` is not recommended and will to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.FrK   )r%   r&   
is_decoderhas_relative_attention_biasrelative_attention_num_bucketsrelative_attention_max_distancerN   d_kvkey_value_proj_dim	num_headsn_headsrS   rT   	inner_dimrr   loggerwarning_oncer/   rA   r   rM   qkvo	Embeddingrelative_attention_biassetpruned_headsgradient_checkpointingr,   rH   ru   rr   r/   s       r0   r&   zPop2PianoAttention.__init__   su    	 ++F(.4.S+/5/U,~"(+'*(??",4>+B , , ,   4<eDDD4<eDDD4<eDDD4>4<eDDD+ 	k+-<8[]a]i+j+jD(EE&+###r1   c                    t          |          dk    rd S t          || j        | j        | j                  \  }}t          | j        |          | _        t          | j        |          | _        t          | j        |          | _        t          | j	        |d          | _	        | j        t          |          z
  | _        | j        | j        z  | _
        | j                            |          | _        d S )Nr   r   dim)lenr   r{   ry   r   r   r   r   r   r   r|   union)r,   headsindexs      r0   prune_headszPop2PianoAttention.prune_heads   s    u::??F74<!8$:K
 
u $DFE22#DFE22#DFE22#DFEq999|c%jj004<? -33E::r1   T       c                 P   d}|rC|dz  }|| dk                         t          j                  |z  z  }t          j        |           } n(t          j        | t          j        |                      } |dz  }| |k     }|t          j        |                                 |z            t          j        ||z            z  ||z
  z                       t          j                  z   }t          j        |t          j	        ||dz
                      }|t          j
        || |          z  }|S )a  
        Adapted from Mesh Tensorflow:
        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

        Translate relative position to a bucket number for relative attention. The relative position is defined as
        memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
        position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
        small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
        positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
        This should allow for more graceful generalization to longer sequences than the model has been trained on

        Args:
            relative_position: an int32 Tensor
            bidirectional: a boolean - whether the attention is bidirectional
            num_buckets: an integer
            max_distance: an integer

        Returns:
            a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
        r   r3   r   )r6   r(   longabsmin
zeros_likelogfloatmath	full_likewhere)relative_positionbidirectionalnum_bucketsmax_distancerelative_buckets	max_exactis_smallrelative_position_if_larges           r0   _relative_position_bucketz,Pop2PianoAttention._relative_position_bucket   s>   ,  	cAK!2Q!6 : :5: F F TT %	*; < <!&+<e>NO`>a>a!b!b b  1$	$y0 &/I'--//);<<h|i/001Y&( "UZ..	&"
 &+Y&8RT_bcTc(d(d&
 &
" 	EK2CE_```r1   c                    || j         j        j        }|,t          j        |t          j        |          dddf         }n|dddf                             |          }t          j        |t          j        |          dddf         }||z
  }|                     || j         | j	        | j
                  }|                      |          }	|	                    g d                              d          }	|	S )z%Compute binned relative position biasN)r;   device)r   r   r   )r3   r   r   r   )r   r*   r   r(   aranger   r6   r   rt   rv   rw   permute	unsqueeze)
r,   query_length
key_lengthr   cache_positioncontext_positionmemory_positionr   relative_position_bucketvaluess
             r0   compute_biaszPop2PianoAttention.compute_bias  s   >18?F!$|L
SYZZZ[\[\[\^b[bc-aaag699&AA,zFSSSTXZ[Z[Z[T[\+.>>#'#A#A#.;=	 $B $
 $
  --.FGG			**44Q77r1   past_key_valuepast_key_values4.58new_nameversionc                    |j         dd         \  }}|du}|                     |          }|                    |d| j        | j                                      dd          }d}t          |t                    r1|j        	                    | j
                  }|r|j        }n
|j        }n|}|r|n|}|r3|1|r/|j        | j
                 j        }|j        | j
                 j        }n|                     |          }|                     |          }|                    |d| j        | j                                      dd          }|                    |d| j        | j                                      dd          }|N|s|
nd}
|                    ||| j
        d|
i          \  }}|r$t          |t                    rd|j        | j
        <   t'          j        ||                    dd                    }||j         d	         }||n
|
d         dz   }| j        s@t'          j        d| j        ||f|j        |j        
          }| j        r| j        rd|_        n3|                     |||j        |
          }|dddd| dddf         }|$|ddddddd|j         d	         f         }||z   }| j        rUt'          j        |j         d                   }d|t?          | j                  <   |dd|                                 f         }n|}||z  }tB          j"        #                    |$                                d          %                    |          }tB          j"        &                    || j&        | j                  }|||z  }t'          j        ||          }|                    dd          '                                }|                    |d| j(                  }| )                    |          }||f}|	r||fz   }|S )z
        Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states).
        Nr3   r4   r   Fr   Tr   )r   r;   )r   r   r   r   )ptraining)*shaper   viewr{   ry   	transposerZ   r   
is_updatedgetrr   cross_attention_cacheself_attention_cachelayerskeysr   r   r   updater(   matmulru   zerosr   r;   r   r   requires_gradr   r   r)   listboolr   
functionalsoftmaxr   type_asrT   
contiguousr|   r   )r,   r>   maskkey_value_statesposition_biasr   layer_head_maskr   	use_cacheoutput_attentionsr   
batch_size
seq_lengthis_cross_attentionquery_statesr   curr_past_key_valuecurrent_states
key_statesvalue_statesscoresr   real_seq_lengthcausal_maskposition_bias_maskedattn_weightsattn_outputoutputss                               r0   r@   zPop2PianoAttention.forward  su   & "/!4RaR!8
J .T9vvm,,#((RtG^__iijkmnoo 
o':;; 	2(377GGJ! K&5&K##&5&J##"1-?R))] 	F/"=*"=,3DNCHJ.5dnELLL//J66.11L#RtG^__iijkmnooJ',,ZT\4KbccmmnoqrssL*7I!St+>+E+Edn?OQ_>`, ,(
L & F*_FY*Z*Z FAEO.t~> lJ,@,@A,F,FGG #)"-J.:.FllN[]L^abLbO3 
E %j*=fm[a[g! ! ! . 74= 726M/ $ 1 1#ZVd !2 ! ! !.aaaZKLL!!!.C D"111aaa,Bj.>r.B,B#BC - ; 	1:m1!455D,-Dd'(()#0DIIKK#@  #0 && },,V\\^^,DDLLVTT},,\T\TXTa,bb &'/9Ll<>>!++Aq11<<>>!&&z2t~FFff[)). 	0/Gr1   FN)Tr   r   )NN)	NNNNNNFFN)rA   rB   rC   r   r   intr&   r   staticmethodr   r   r   r@   rD   rE   s   @r0   rq   rq      s         %*#'	!, !,!, C=	!, !, !, !, !, !,F; ; ;  -  -  -  \- ^   ( _%0A6RRR m m m SRm m m m mr1   rq   c                   p     e Zd Zd
dee         f fdZ eddd          	 	 	 	 	 	 	 dd	            Z xZS )Pop2PianoLayerSelfAttentionFNrr   c                     t                                                       t          |||          | _        t	          |j        |j                  | _        t          j	        |j
                  | _        d S )Nru   rr   ri   )r%   r&   rq   SelfAttentionr"   rN   rl   rm   r   rR   rS   rT   r   s       r0   r&   z$Pop2PianoLayerSelfAttention.__init__  sl    /0KW`
 
 
 -V^AZ[[[z&"566r1   r   r   r   r   c	           
          |                      |          }	|                     |	|||||||          }
||                     |
d                   z   }|f|
dd          z   }|S )N)r   r   r   r   r   r   r   r   r   )rm   r   rT   )r,   r>   attention_maskr   r   r   r   r   r   normed_hidden_statesattention_outputr   s               r0   r@   z#Pop2PianoLayerSelfAttention.forward  s      $}==-- '++/) . 	
 	
 &5Ea5H(I(II "%5abb%99r1   r   )NNNNFFN	rA   rB   rC   r   r   r&   r   r@   rD   rE   s   @r0   r   r     s        7 7XVY] 7 7 7 7 7 7 _%0A6RRR    SR    r1   r   c                   r     e Zd Zd
dee         f fdZ eddd          	 	 	 	 	 	 	 	 dd	            Z xZS )Pop2PianoLayerCrossAttentionNrr   c                     t                                                       t          |d|          | _        t	          |j        |j                  | _        t          j	        |j
                  | _        d S )NFr   ri   )r%   r&   rq   EncDecAttentionr"   rN   rl   rm   r   rR   rS   rT   )r,   rH   rr   r/   s      r0   r&   z%Pop2PianoLayerCrossAttention.__init__  sc    1&V[gpqqq,V^AZ[[[z&"566r1   r   r   r   r   Fc                     |                      |          }|                     |||||||||	|

  
        }||                     |d                   z   }|f|dd          z   }|S )N)	r   r   r   r   r   r   r   r   r   r   r   )rm   r   rT   )r,   r>   r   r   r   r   r   r   r   r   r   r   r   layer_outputr   s                  r0   r@   z$Pop2PianoLayerCrossAttention.forward  s      $}==// -'++%/) 0 
 
 %t||4DQ4G'H'HH/$4QRR$88r1   rY   )NNNNFNFNr   rE   s   @r0   r   r     s        7 7(3- 7 7 7 7 7 7 _%0A6RRR
    SR    r1   r   c                   z     e Zd Zddee         f fdZ eddd          	 	 	 	 	 	 	 	 	 	 	 	 dd
            Z xZS )Pop2PianoBlockFNrr   c                    t                                                       |j        | _        t          j                    | _        | j                            t          |||                     | j        r)| j                            t          ||                     | j                            t          |                     d S )Nr   )rr   )
r%   r&   rt   r   
ModuleListlayerappendr   r   rg   r   s       r0   r&   zPop2PianoBlock.__init__  s     +]__

'4O[d  	
 	
 	

 ? 	YJ:6YWWWXXX
*62233333r1   r   r   r   r   Tc                     | j         d         |||||	|
||          }|d         }|dd          }|j        t          j        k    rt          j        t          j        |                                          t          j        |j                  j        dz
  t          j        |j                  j                  }t          j	        || |          }| j
        o|d u}|r | j         d         ||||||	|d         dz   |
|	  	        }|d         }|j        t          j        k    rt          j        t          j        |                                          t          j        |j                  j        dz
  t          j        |j                  j                  }t          j	        || |          }||dd          z   } | j         d         |          }|j        t          j        k    rt          j        t          j        |                                          t          j        |j                  j        dz
  t          j        |j                  j                  }t          j	        || |          }|f}||z   S )Nr   )r   r   r   r   r   r   r   r   i  )r   maxr4   )r   r   r   r   r   r   r   r   )r   r;   r(   r<   r   isinfanyfinfor   clamprt   )r,   r>   r   r   encoder_hidden_statesencoder_attention_maskencoder_decoder_position_biasr   cross_attn_layer_head_maskr   r   r   return_dictr   self_attention_outputsattention_outputsclamp_valuedo_cross_attentioncross_attention_outputsr   s                       r0   r@   zPop2PianoBlock.forward  s   " "/A)'++/)	"
 	"
 	"
 /q121226 %-//+M**..00M/004t;M/004 K
 "KK<[YYYM!_R1Fd1R 	P&3djm!65; : /+B/!3#"3
' 
' 
'# 4A6M "em33#kK..2244K 34484?K 3448 
 !&M|Q\ ] ] ] !24KABB4O O '
2}55 %-//+M**..00M/004t;M/004 K
 "KK<[YYYM " ''	
r1   r   )NNNNNNNNFFTNr   rE   s   @r0   r   r     s        4 4XVY] 4 4 4 4 4 4 _%0A6RRR "#&*#'Q
 Q
 Q
 SRQ
 Q
 Q
 Q
 Q
r1   r   c                   B    e Zd ZU eed<   dZdZdZdZdgZ	dgZ
d Zd Zd	S )
Pop2PianoPreTrainedModelrH   transformerFTr   rQ   c                 	   | j         j        }t          |t                    r$|j        j                            |dz             dS t          |t                    r+|j        j        j        	                    d|dz             dS t          |t                    rt|j        j        j        	                    d|dz             t          |d          r7| j         j        s-|j        j        j        	                    d|dz             dS dS dS t          |t                    r|j        j        j        	                    d|| j         j        dz  z             t          |j        d          r/|j        j        #|j        j        j                                         |j        j        j        	                    d|| j         j        dz  z             t          |j        d          r1|j        j        '|j        j        j                                         dS dS dS t          |t,                    rt|j        j        j        	                    d|| j         j        dz  z             t          |j        d          r/|j        j        #|j        j        j                                         |j        j        j        	                    d|| j         j        dz  z             t          |j        d          r/|j        j        #|j        j        j                                         |j        j        j        	                    d|| j         j        dz  z             t          |j        d          r1|j        j        '|j        j        j                                         dS dS dS t          |t2                    r| j         j        }| j         j        }| j         j        }|j        j        j        	                    d|||z  dz  z             |j        j        j        	                    d||dz  z             |j        j        j        	                    d||dz  z             |j        j        j        	                    d|||z  dz  z             |j         r0|j!        j        j        	                    d||dz  z             dS dS dS )zInitialize the weights      ?        )r9   stdlm_head      rL   N)"rH   initializer_factorrZ   r"   r*   datafill_Pop2PianoConcatEmbeddingToMel	embeddingnormal_!Pop2PianoForConditionalGenerationsharedhasattrtie_word_embeddingsr  rG   rP   rN   rL   zero_rQ   rO   r_   ra   rb   rq   rx   rz   r   r   r   r   ru   r   )r,   modulefactorrN   ry   r{   s         r0   _init_weightsz&Pop2PianoPreTrainedModel._init_weightsN  s   /f011 )	oM$$Vc\22222 =>> '	o#(00cv|0LLLLL ABB %	o M %--3FSL-IIIvy)) O$+2Q O%*22#2NNNNNO O O O 677 	o I!))s4;CV[_B_8`)aaavy&)) ,fin.H	#))+++I!))s4;CSX\B\8])^^^vy&)) ,fin.H	#))+++++, ,.H.H ;<< 	oK#++&T[EX]aDa:b+cccv{F++ .0@0L %++---K#++&T[EX]aDa:b+cccv{F++ .0@0L %++---I!))s4;CSX\B\8])^^^vy&)) ,fin.H	#))+++++, ,.H.H 233 	o k)G!%!1k+GHO ((cv'L^B^cgAg7h(iiiHO ((cv$7O(PPPHO ((cv$7O(PPPHO ((cv'L^B^cgAg7h(iii1 o.5:BBQW\chl[lQmBnnnnn	o 	oo or1   c                    | j         j        }| j         j        }|t          d          t	          |          rHt          j        |j        d d         dz   |          }t          j        ||dd df         gd          }nD|	                    |j                  }|dd df         
                                |ddd f<   ||d<   |t          d          |                    |d	k    |           |S )
Nzoself.model.config.decoder_start_token_id has to be defined. In Pop2Piano it is usually set to the pad_token_id.r4   )r   .r   r   ).r   z1self.model.config.pad_token_id has to be defined.)rH   decoder_start_token_idpad_token_id
ValueErrorr   r(   fullr   cat	new_zerosclonemasked_fill_)r,   	input_idsr'  r(  shifted_input_idss        r0   _shift_rightz%Pop2PianoPreTrainedModel._shift_right|  s   !%!C{/!) B  
 Y'' 	? %
9?3B3+?$+FH^ _ _ %	+<iSbS>Q*RXZ [ [ [ ) 3 3IO D D)238)<)B)B)D)Dc122g&(>f%PQQQ&&'8D'@,OOO  r1   N)rA   rB   rC   r   __annotations__base_model_prefixis_parallelizablesupports_gradient_checkpointing_can_compile_fullgraph_no_split_modules_keep_in_fp32_modulesr$  r1   r1   r0   r  r  C  sm         %&*#")*!F,o ,o ,o\! ! ! ! !r1   r  c                        e Zd Zd fd	Zd Z	 	 	 	 	 	 	 	 	 	 	 	 	 ddZ	 ddeej        df         dej        d	ej        d
e	de
f
dZedej        dededej        d	ej        defd            Z xZS )Pop2PianoStackNc                    t                                                     || _        j        | _        t	          j        fdt          j                  D                       | _        t          j
        j                  | _        t	          j        j                  | _        |                                  d| _        d | _        d| _        d S )Nc           	      V    g | ]%}t          t          |d k              |          &S )r   r   )r   r   ).0irH   s     r0   
<listcomp>z+Pop2PianoStack.__init__.<locals>.<listcomp>  sC        v4Q<<[\]]]  r1   ri   F)r%   r&   embed_tokensrt   r   r   range
num_layersblockr"   rN   rl   final_layer_normrR   rS   rT   	post_initmodel_parallel
device_mapr   )r,   rH   rA  r/   s    ` r0   r&   zPop2PianoStack.__init__  s       ( +]   v011  
 

 !36>vG` a a az&"566 	#&+###r1   c                     || _         d S rY   )rA  r,   new_embeddingss     r0   set_input_embeddingsz#Pop2PianoStack.set_input_embeddings  s    *r1   c                    |	|	n| j         j        }	|
|
n| j         j        }
||n| j         j        }||n| j         j        }|#|!| j        rdnd}t          d| d| d          |1|                                }|                    d|d                   }n@||                                d d         }n!| j        rdnd}t          d| d| d	          | j	        r%| j
        r|	rt                              d
           d}	|+| j        t          d          |                     |          }|\  }}|	du r| j        st          d|  d          | j        r]|	rZ|X| j         j        r7t          t!          | j                   t!          | j                             }nt!          | j                   }n	| j        sd }||                                nd}|t%          j        |||z   |j                  }|/t+                      s!||z   }t%          j        |||j                  }| j         j        r6|                     |||t1          |t                    r|j        n||
          }nO|d d d d d d f         }|                    |j                  }d|z
  t%          j        |j                  j        z  }| j        rQ|O|                                \  }}}||f}|t%          j        ||j                  }|                     |          }nd }|                     || j         j                   }|                     || j         j                   }|rdnd }|
rdnd }|
r	| j        rdnd }d }d }| !                    |          }tE          | j#                  D ]z\  } }!||          }"||          }#|r||fz   } |!|||||||"|#||	|
|          }$|$d         }|$d         }| j        r||$|
rdnd         }|
r||$d         fz   }| j        r||$d         fz   }{| $                    |          }| !                    |          }|r||fz   }|stK          d |||||fD                       S tM          |||||          S )Ndecoder_ zYou cannot specify both zinput_ids and zinputs_embeds at the same timer4   zYou have to specify either zinput_ids or inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fz<You have to initialize the model with valid token embeddingsTz)`use_cache` can only be set to `True` if z is used as a decoder)rH   r   r   )r;   r  r9  )r   r  r   r   r   r   r   r   r3      c              3      K   | ]}||V  	d S rY   r9  )r>  r   s     r0   	<genexpr>z)Pop2PianoStack.forward.<locals>.<genexpr>R  s4       
 
 =  !===
 
r1   )last_hidden_stater   r>   
attentionscross_attentions)'rH   r   r   output_hidden_statesuse_return_dictrt   r)  sizer   r   r   r}   r~   rA  is_encoder_decoderr   r   get_seq_lengthr(   r   r   r   r)   _update_causal_maskrZ   r   r6   r;   r  r   invert_attention_maskget_head_maskrC  rT   	enumeraterD  rE  tupler   )%r,   r/  r   r  r  rP  	head_maskcross_attn_head_maskr   r   r   rX  r  r   err_msg_prefixinput_shaper   r   past_key_values_lengthmask_seq_lengthr   encoder_batch_sizeencoder_sequence_length_encoder_hidden_shapeencoder_extended_attention_maskall_hidden_statesall_attentionsall_cross_attentionsr   r  r>   r?  layer_moduler   r  layer_outputss%                                        r0   r@   zPop2PianoStack.forward  s     "+!6IIDK<Q	1B1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B] ]%>+/?BZZNw>wwwww   "#..**K!r;r?;;II&',,..ss3KK+/?BZZNu>uuXfuuuvvv& 	"4= 	" "##p   "	  ( !_``` --i88M!,
J? j !hT!h!h!hiii? 	# G_4;1 G&9$DK888,dk:Z:Z:Z' 'OO '3$+&F&F&FO 	# #OETE`!?!?!A!A!Afg!"\&(>(KTaTh  N !*B*D*D!4zAO"Z
OML`aaaN;! 	U22o/BCC%44$! KK )D$)9:K%..}/B.CCK,M<O0P0P0TTK ? 	34@=R=W=W=Y=Y: 7$68O#P %-).4HQ^Qe)f)f)f&.2.H.HI_.`.`++.2+ &&y$+2HII	#112FH^__"6@BBD0:d&7VDOVrrRV(,%]33(44 !	V !	VOA|'lO)=a)@&# I$58H$H!(L%/- /+E /#"3-  M *!,M
 *!,M ]#8#D0=CT>[aaZ[0\-  V!/=3C2E!E? V+?=QRCSBU+U(--m<<]33   	E 1]4D D 	 
 
 "#%"(
 
 
 
 
 
 9+++%1
 
 
 	
r1   Fr   r   input_tensorr   r   r   c           	      $   | j         j        dk    r||dk                                    r|S d S | j         j        dk    r+t          |t          j                  rt          |          }|S ||                                nd}||j        nd}| j         j        dk    r#|s!|st          j
        |||| j                  rd S |j        }|j        d         }	|r|                                }
n/t          |t          j                  r|j        d	         n||	z   dz   }
|                     ||	|
|||j        d         
          }| j         j        dk    r@|>|j        j        dv r0|s.t	          j        |          j        }t          j        ||          }|S )Nflash_attention_2r  flex_attentionr   Fsdpa)rP  rf  is_trainingr   r4   )sequence_lengthtarget_lengthr;   r   r   )cudaxpunpu)rH   _attn_implementationr  rZ   r(   r[   r   r\  is_compileabler   _ignore_causal_mask_sdpar   r;   r   get_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_positionr   typer  r   _unmask_unattended)r,   r   rr  r   r   r   past_seen_tokensusing_compilable_cacher;   rx  ry  r   	min_dtypes                r0   r]  z"Pop2PianoStack._update_causal_maskf  s    ;+/BBB)~/D.I.I.K.K)%%4;+/???.%,77 M!<^!L!L!!
 @O?Z?99;;;`aCRC^!?!?di ;+v55>T5]n5%>*'7 M	    t"&,Q/! 	+??AAMM nel;;<$R((%7!;  PP+')#)!, Q 
 
 K,66*%*.DDD% E E**.I0CKQZ[[Kr1   rx  ry  r;   r   c                    | |                                  dk    r| }nMt          j        |          j        }t          j        ||f|||j                  }|dk    rt          j        |d          }|t          j        ||j                  |                    dd          k    z  }|ddddddf         	                    |ddd          }| |
                                }| j        d         }	|ddddddd|	f         | ddddddf                             |j                  z   }
|
dk    }
|ddddddd|	f                             |
|          |ddddddd|	f<   |S )	aM  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        NrR  )
fill_valuer;   r   r   )diagonalrQ  r4   r   )r   r(   r  r   r*  r   triur   reshapeexpandr-  r   r6   masked_fill)r   rx  ry  r;   r   r   kwargsr   r  mask_lengthpadding_masks              r0   r  zDPop2PianoStack._prepare_4d_causal_attention_mask_with_cache_position  s   > %.*<*<*>*>!*C*C(KKE**.I* -0Ye\j\q  K !###jqAAA5<n>STTTWeWmWmnprsWtWtttK%dD!!!QQQ&67>>z1bRTUUK))//11,226*111aaaL[L+@ANSTSTSTVZ\`bcbcbcScDdDgDg&E E    ,q05@AAAqqq,;,AV5W5c5c )6 6AAAqqq!!!\k\12 r1   rY   )NNNNNNNNNNNNN)F)rA   rB   rC   r&   rL  r@   r   r(   r[   r
   r   r]  r   r   r;   r  rD   rE   s   @r0   r;  r;    sU       , , , , , ,.+ + +
 "#!!p
 p
 p
 p
r #(B BelK78B lB 	B
 B  B B B BH 444 4 {	4
 4 4 4 4 \4 4 4 4 4r1   r;  c                   (     e Zd ZdZ fdZd Z xZS )r  z'Embedding Matrix for `composer` tokens.c                     t                                                       t          j        |j        |j                  | _        d S )N)num_embeddingsembedding_dim)r%   r&   r   r   composer_vocab_sizerN   r  rW   s     r0   r&   z&Pop2PianoConcatEmbeddingToMel.__init__  s:    V5O_e_mnnnr1   c                     ||z
  }|                      |                              d          }t          j        ||gd          }|S )Nr   r   )r  r   r(   r+  )r,   featureindex_valueembedding_offsetindex_shiftedcomposer_embeddingrP  s          r0   r@   z%Pop2PianoConcatEmbeddingToMel.forward  sM    #&66!^^M::DDQGG	#5w"?QGGGr1   )rA   rB   rC   __doc__r&   r@   rD   rE   s   @r0   r  r    sR        11o o o o o      r1   r  zA
    Pop2Piano Model with a `language modeling` head on top.
    )custom_introc            *           e Zd Zg dZdef fdZd Zd Zd Z	 d"de	j
        d	ed
edee	j
                 fdZe	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d#dee	j                 dee	j
                 dee	j                 dee	j                 dee	j
                 dee	j
                 dee	j                 deeee	j                                   dee         dee	j
                 dee	j
                 dee	j
                 dee	j                 dee         dee         dee         dee         dee	j                 deee	j
                 ef         f&d            Z e	j                    	 	 	 d$ fd 	            Zde	j        fd!Z xZS )%r  )zencoder.embed_tokens.weightzdecoder.embed_tokens.weightzlm_head.weightrH   c                 j   t                                          |           || _        |j        | _        t          j        |j        |j                  | _        t          |          | _
        t          j        |          }d|_        d|_        d|_        t!          || j                  | _        t          j        |          }d|_        d|_        |j        |_        t!          || j                  | _        t          j        |j        |j        d          | _        |                                  d S )NFTrK   )r%   r&   rH   rN   	model_dimr   r   
vocab_sizer  r  mel_conditionercopydeepcopyrt   r   tie_encoder_decoderr;  encodernum_decoder_layersrC  decoderrM   r  rF  )r,   rH   encoder_configdecoder_configr/   s       r0   r&   z*Pop2PianoForConditionalGeneration.__init__  s       l6#4fnEE<VDDv..$)!#( -2*%ndkBBv..$(!-2*$*$=!%ndkBBy1BOOO 	r1   c                     | j         S rY   )r  r,   s    r0   get_input_embeddingsz6Pop2PianoForConditionalGeneration.get_input_embeddings  s
    {r1   c                 |    || _         | j                            |           | j                            |           d S rY   )r  r  rL  r  rJ  s     r0   rL  z6Pop2PianoForConditionalGeneration.set_input_embeddings  s;    $)).999)).99999r1   c                     | j         S rY   )r  r  s    r0   get_encoderz-Pop2PianoForConditionalGeneration.get_encoder  s
    |r1   Ninput_featurescomposergeneration_configr   c                 <   |j         }||vr4t          dt          |                                           d|           ||         }t	          j        || j                  }|                    |j        d                   }t          |
                                          }|                     |||          }|\d||dddf                                          <   t	          j        |dddf                             dd	          |gd	
          }||fS |dfS )a  
        This method is used to concatenate mel conditioner tokens at the front of the input_features in order to
        control the type of MIDI token generated by the model.

        Args:
            input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                input features extracted from the feature extractor.
            composer (`str`):
                composer token which determines the type of MIDI tokens to be generated.
            generation_config (`~generation.GenerationConfig`):
                The generation is used to get the composer-feature_token pair.
            attention_mask (``, *optional*):
                For batched generation `input_features` are padded to have the same shape across all examples.
                `attention_mask` helps to determine which areas were padded and which were not.
                - 1 for tokens that are **not padded**,
                - 0 for tokens that are **padded**.
        zPlease choose a composer from z. Composer received - rQ  r   )r  r  r  Nr  r4   r   )axis)composer_to_feature_tokenr)  r   r   r(   tensorr   repeatr   r   r   r  r   concatenater   )r,   r  r  r  r   r  composer_valuer  s           r0   get_mel_conditioner_outputsz=Pop2PianoForConditionalGeneration.get_mel_conditioner_outputs  sN   0 %6$O!444y6O6T6T6V6V1W1Wyyowyy   38<nT[III'..~/CA/FGG8??AABB--"&- . 
 

 %;>NN111a40557778 #.qqq!t0D0I0I"a0P0PR`/ahijjjN!>11t##r1   r/  decoder_input_idsdecoder_attention_maskrb  decoder_head_maskrc  encoder_outputsr   rP  decoder_inputs_embedslabelsr   r   rX  r  r   returnc                    ||n| j         j        }||n| j         j        }|
|t          d          ||
|}
||                     |||
||||          }ne|rct          |t                    sNt          |d         t          |          dk    r|d         ndt          |          dk    r|d         nd          }|d         }||||                     |          }| 	                    ||||	|||||||||          }|d         }| j         j
        r|| j        d	z  z  }|                     |          }d}|Vt          d
          } ||                    d|                    d                    |                    d                    }|s|f|dd         z   |z   }||f|z   n|S t!          |||j        |j        |j        |j        |j        |j        |j        	  	        S )a2
  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Pop2Piano is a model with relative position embeddings
            so you should be able to pad the inputs on both the right and the left. Indices can be obtained using
            [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for detail.
            [What are input IDs?](../glossary#input-ids) To know more on how to prepare `input_ids` for pretraining
            take a look a [Pop2Piano Training](./Pop2Piano#training).
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary. Indices can be obtained using
            [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
            [What are decoder input IDs?](../glossary#decoder-input-ids) Pop2Piano uses the `pad_token_id` as the
            starting token for `decoder_input_ids` generation. If `past_key_values` is used, optionally only the last
            `decoder_input_ids` have to be input (see `past_key_values`). To know more on how to prepare
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
            1]`:
            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
            `[0, 1]`:
            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ...,
            config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for
            labels in `[0, ..., config.vocab_size]`
        NzSBoth `inputs_embeds` and `input_features` received! Please provide only one of them)r/  r   rP  rb  r   rX  r  r   r   r3   )rU  r>   rV  )r/  r   rP  r   r  r  rb  rc  r   r   rX  r  r   r  r&  )ignore_indexr4   )	losslogitsr   decoder_hidden_statesdecoder_attentionsrW  encoder_last_hidden_stater  encoder_attentions)rH   r   rY  r)  r  rZ   r   r   r1  r  r   r  r  r   r   rZ  r   r   r>   rV  rW  rU  )r,   r/  r   r  r  rb  r  rc  r  r   rP  r  r  r  r   r   rX  r  r   r>   decoder_outputssequence_output	lm_logitsr  loss_fctoutputs                             r0   r@   z)Pop2PianoForConditionalGeneration.forwardP  s   j "+!6IIDK<Q	%0%<kk$+B]$)Crsss'M,A*M ""ll#-+#"3%9' +  OO  	O_!M!M 	-"1!"4474H4H14L4Loa00RV14_1E1E1I1I?1--t  O (*"3";@U@] $ 1 1& 9 9 ,,'1/+"/#1'!5/!5#) ' 
 
  *!,;* 	G .1EFOLL11	'T:::H8INN2y~~b/A/ABBFKKPROOTTD 	F\OABB$77/IF)-)9TGf$$vE+;"1"?.9,=&5&G"1"?.9

 

 

 
	
r1   	composer1c                    || j         } |j        d	i | t          |d          st          d          t	          |j                  | j        j        k    r2t          d| j        j         dt	          |j                   d          |                     ||||          \  }} t                      j
        d	d|||d|S )
a  
        Generates token ids for midi outputs.

        <Tip warning={true}>

        Most generation-controlling parameters are set in `generation_config` which, if not passed, will be set to the
        model's default generation configuration. You can override any `generation_config` by passing the corresponding
        parameters to generate(), e.g. `.generate(inputs, num_beams=4, do_sample=True)`. For an overview of generation
        strategies and code examples, check out the [following guide](./generation_strategies).

        </Tip>

        Parameters:
            input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                This is the featurized version of audio generated by `Pop2PianoFeatureExtractor`.
            attention_mask:
                For batched generation `input_features` are padded to have the same shape across all examples.
                `attention_mask` helps to determine which areas were padded and which were not.
                - 1 for tokens that are **not padded**,
                - 0 for tokens that are **padded**.
            composer (`str`, *optional*, defaults to `"composer1"`):
                This value is passed to `Pop2PianoConcatEmbeddingToMel` to generate different embeddings for each
                `"composer"`. Please make sure that the composer value is present in `composer_to_feature_token` in
                `generation_config`. For an example please see
                https://huggingface.co/sweetcocoa/pop2piano/blob/main/generation_config.json .
            generation_config (`~generation.GenerationConfig`, *optional*):
                The generation configuration to be used as base parametrization for the generation call. `**kwargs`
                passed to generate matching the attributes of `generation_config` will override them. If
                `generation_config` is not provided, the default will be used, which had the following loading
                priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
                configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
                default values, whose documentation should be checked to parameterize generation.
            kwargs:
                Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be
                forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder
                specific kwargs should not be prefixed and decoder specific kwargs should be prefixed with *decoder_*.
        Return:
            [`~utils.ModelOutput`] or `torch.LongTensor`: A [`~utils.ModelOutput`] (if `return_dict_in_generate=True`
            or when `config.return_dict_in_generate=True`) or a `torch.FloatTensor`.
                Since Pop2Piano is an encoder-decoder model (`model.config.is_encoder_decoder=True`), the possible
                [`~utils.ModelOutput`] types are:
                    - [`~generation.GenerateEncoderDecoderOutput`],
                    - [`~generation.GenerateBeamEncoderDecoderOutput`]
        Nr  z`composer_to_feature_token` was not found! Please refer to https://huggingface.co/sweetcocoa/pop2piano/blob/main/generation_config.jsonand parse a dict like that.ztconfig.composer_vocab_size must be same as the number of keys in generation_config.composer_to_feature_token! Found z vs .)r  r   r  r  )inputsrP  r   r  r9  )r  r   r  r)  r   r  rH   r  r  r%   generate)r,   r  r   r  r  r  r/   s         r0   r  z*Pop2PianoForConditionalGeneration.generate  s2   l $ $ 6  **6*** (*EFF 	.    :;;t{?^^^r8r r>ABSBm>n>nr r r   *.)I)I))/	 *J *
 *
&  uww 
()/	
 

 
 
 	
r1   c                 ,    |                      |          S rY   )r1  )r,   r  s     r0   %prepare_decoder_input_ids_from_labelszGPop2PianoForConditionalGeneration.prepare_decoder_input_ids_from_labels/  s      (((r1   rY   )NNNNNNNNNNNNNNNNNN)Nr  N)rA   rB   rC   _tied_weights_keysr   r&   r  rL  r  r(   FloatTensorstrr   r   r  r   
LongTensor
BoolTensorr[   ra  r
   r   r   r   r@   no_gradr  r  rD   rE   s   @r0   r  r    s        jii      6  : : :
   7;/$ /$)/$ /$ ,	/$
 !!23/$ /$ /$ /$b  156:8<=A159=7;@D+/596:=A-1$(,0/3&*59'B
 B
E,-B
 !!23B
 $E$45	B

 !))9 :B
 E-.B
 $E$56B
 'u|4B
 "%el(;"<=B
 "%B
   12B
 !!23B
  ((9:B
 )*B
 D>B
  $D>!B
" 'tn#B
$ d^%B
& !!12'B
( 
uU&'8	9)B
 B
 B
 ^B
H U]__ W
 W
 W
 W
 W
 _W
r)EL ) ) ) ) ) ) ) )r1   r  )Gr  r  r   typingr   r   r(   r   torch.nnr   transformers.generationr   activationsr	   cache_utilsr
   r   r   
generationr   modeling_attn_mask_utilsr   modeling_layersr   modeling_outputsr   r   r   modeling_utilsr   pytorch_utilsr   r   utilsr   r   r   r   r   utils.deprecationr   configuration_pop2pianor   !torch.nn.attention.flex_attentionr   integrations.flex_attentionr   
get_loggerrA   r}   _load_pop2piano_layer_normapex.normalizationr    infoImportError	ExceptionwarningModuler"   rG   r_   rg   rq   r   r   r   r  r;  r  r  __all__r9  r1   r0   <module>r     s       " " " " " " " "        % % % % % % 4 4 4 4 4 4 ! ! ! ! ! ! C C C C C C C C C C ) ) ) ) ) ) > > > > > > 9 9 9 9 9 9 k k k k k k k k k k - - - - - - Q Q Q Q Q Q Q Q w w w w w w w w w w w w w w 0 0 0 0 0 0 4 4 4 4 4 4  !! K;;;;;;JJJJJJ 
	H	%	%! 	//////!&
KKhiiii 	 	 	D 	 	 	
NN^___D	+ + + + + + + +2 " &%    RY   .    ")   <    ry   &f f f f f f f fT" " " " "") " " "L$ $ $ $ $29 $ $ $Pa
 a
 a
 a
 a
/ a
 a
 a
H P! P! P! P! P! P! P! P!fI I I I I- I I IX
    BI      
z) z) z) z) z)(@/ z) z) 
z)z	 /0J
Ks   &C C'C'&C'