
    Pi&                        d dl mZmZ d dlZd dlmc mZ d dlmZmZ d dl	m
Z
  G d dej                  Z G d dej                  Z G d	 d
ej                  Zdedej        fdZdededej        fdZ G d dej                  Z G d dej                  Zdedededej        fdZ G d dej                  Z G d dej                  Z G d dej                  Z G d dej                  Zdedefd ZdS )!    )ListTupleN)nnTensor)MultiHeadAttentionc                        e Zd ZdZdeeeef         dej        dej        f fdZde	de	fdZ
de	de	fd	Zd
e	de	fdZ xZS )FluxAutoencodera  
    The image autoencoder for Flux diffusion models.

    Args:
        img_shape (Tuple[int, int, int]): The shape of the input image (without the batch dimension).
        encoder (nn.Module): The encoder module.
        decoder (nn.Module): The decoder module.
    	img_shapeencoderdecoderc                 r    t                                                       || _        || _        || _        d S N)super__init__
_img_shaper   r   )selfr
   r   r   	__class__s       v/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torchtune/models/flux/_autoencoder.pyr   zFluxAutoencoder.__init__   s4     	#    xreturnc                 R    |                      |                     |                    S )z
        Args:
            x (Tensor): input image of shape [bsz, ch_in, img resolution, img resolution]

        Returns:
            Tensor: output image of the same shape
        )decodeencoder   r   s     r   forwardzFluxAutoencoder.forward&   s      {{4;;q>>***r   c                 `    |j         dd         | j        k    sJ |                     |          S )a   
        Encode images into their latent representations.

        Args:
            x (Tensor): input images (shape = [bsz, ch_in, img resolution, img resolution])

        Returns:
            Tensor: latent encodings (shape = [bsz, ch_z, latent resolution, latent resolution])
           N)shaper   r   r   s     r   r   zFluxAutoencoder.encode0   s1     wqrr{do----||Ar   zc                 ,    |                      |          S )a  
        Decode latent representations into images.

        Args:
            z (Tensor): latent encodings (shape = [bsz, ch_z, latent resolution, latent resolution])

        Returns:
            Tensor: output images (shape = [bsz, ch_in, img resolution, img resolution])
        )r   )r   r    s     r   r   zFluxAutoencoder.decode=   s     ||Ar   )__name__
__module____qualname____doc__r   intr   Moduler   r   r   r   r   __classcell__r   s   @r   r	   r	      s         	c3'	 	 		 	 	 	 	 	+ +F + + + + 6    
 
6 
 
 
 
 
 
 
 
r   r	   c                   X     e Zd ZdZdededee         dededef fdZd	ed
efdZ	 xZ
S )FluxEncodera	  
    The encoder half of the Flux diffusion model's image autoencoder.

    Args:
        ch_in (int): The number of channels of the input image.
        ch_z (int): The number of latent channels (dimension of the latent vector `z`).
        channels (List[int]): The number of output channels for each downsample block.
        n_layers_per_down_block (int): Number of resnet layers per upsample block.
        scale_factor (float): Constant for scaling `z`.
        shift_factor (float): Constant for shifting `z`.
    ch_inch_zchannelsn_layers_per_down_blockscale_factorshift_factorc                    t                                                       || _        || _        t	          j        |d         ddd          | _        t	          j        fdt          t                              D                       | _
        t          d                   | _        t          d         d|z            | _        d S )Nr      r   kernel_sizestridepaddingc                     g | ]F}t          |d k    r|dz
           nd          |         |t                    dz
  k               GS )r   r   )n_layersr,   ch_out
downsample)	DownBlocklen).0ir.   r/   s     r   
<listcomp>z(FluxEncoder.__init__.<locals>.<listcomp>g   sr         4-.UU(1q5//#A; 3x==1#44	    r      )r   r   r0   r1   r   Conv2dconv_in
ModuleListranger=   down	mid_blockmid	end_blockend)r   r,   r-   r.   r/   r0   r1   r   s      ``  r   r   zFluxEncoder.__init__W   s     	((y1VWXXXM     s8}}--  

 

	 Xb\**Xb\1t844r   r   r   c                     |                      |          }| j        D ]} ||          }|                     |          }|                     |          }t	          |          }| j        || j        z
  z  S )z
        Args:
            x (Tensor): input images (shape = [bsz, ch_in, img resolution, img resolution])

        Returns:
            Tensor: latent encodings (shape = [bsz, ch_z, latent resolution, latent resolution])
        )rD   rG   rI   rK   diagonal_gaussianr0   r1   )r   r   hblockr    s        r   r   zFluxEncoder.forwardv   st     LLOOY 	 	EaAAHHQKKHHQKKa   A(9$9::r   r"   r#   r$   r%   r&   r   floatr   r   r   r(   r)   s   @r   r+   r+   J   s        
 
55 5 s)	5
 "%5 5 5 5 5 5 5 5>; ;F ; ; ; ; ; ; ; ;r   r+   c                   X     e Zd ZdZdededee         dededef fdZd	ed
efdZ	 xZ
S )FluxDecodera  
    The encoder half of the Flux diffusion model's image autoencoder.

    Args:
        ch_out (int): The number of channels of the output image.
        ch_z (int): The number of latent channels (dimension of the latent vector `z`).
        channels (List[int]): The number of output channels for each upsample block.
        n_layers_per_up_block (int): Number of resnet layers per upsample block.
        scale_factor (float): Constant for scaling `z`.
        shift_factor (float): Constant for shifting `z`.
    r:   r-   r.   n_layers_per_up_blockr0   r1   c                    t                                                       || _        || _        t	          j        |d         ddd          | _        t          d                   | _        t	          j	        fdt          t                              D                       | _        t          d         |          | _        d S )Nr   r3   r   r4   c                     g | ]F}t          |d k    r|dz
           nd          |         |t                    dz
  k               GS )r   r   )r9   r,   r:   upsample)UpBlockr=   )r>   r?   r.   rT   s     r   r@   z(FluxDecoder.__init__.<locals>.<listcomp>   sr         2-.UU(1q5//#A;X!22	    r   rA   )r   r   r0   r1   r   rC   rD   rH   rI   rE   rF   r=   uprJ   rK   )r   r:   r-   r.   rT   r0   r1   r   s      ``  r   r   zFluxDecoder.__init__   s     	((yx{!UVWWWXa[))-     s8}}--  

 

 Xb\622r   r    r   c                     || j         z  | j        z   }|                     |          }|                     |          }| j        D ]} ||          }|                     |          }|S )z
        Args:
            z (Tensor): latent encodings (shape = [bsz, ch_z, latent resolution, latent resolution])

        Returns:
            Tensor: output images (shape = [bsz, ch_in, img resolution, img resolution])
        )r0   r1   rD   rI   rY   rK   )r   r    rN   rO   r   s        r   r   zFluxDecoder.forward   sk     !!D$55LLOOHHQKKW 	 	EaAAHHQKKr   rP   r)   s   @r   rS   rS      s        
 
33 3 s)	3
  #3 3 3 3 3 3 3 3> F        r   rS   chr   c                     t          j        t          | |           t          |           t          | |                     S )Nr,   r:   )r   
SequentialResnetLayer	AttnLayer)r[   s    r   rH   rH      s?    ="R(((""R(((  r   r,   r:   c                     t          j        t          j        d| dd          t          j                    t          j        | |ddd                    S )N    ư>T
num_groupsnum_channelsepsaffiner3   r   r4   )r   r^   	GroupNormSiLUrC   r]   s     r   rJ   rJ      sM    =
DNNN
		
	%Qq!DDD  r   c                   @     e Zd Zdedededef fdZdedefdZ xZS )	r<   r9   r,   r:   r;   c                     t                                                       t          |||          | _        |rt	          |          nt          j                    | _        d S r   )r   r   resnet_layerslayers
Downsampler   Identityr;   )r   r9   r,   r:   r;   r   s        r   r   zDownBlock.__init__   sO    #HeV<<0:M*V,,,r   r   r   c                 R    |                      |                     |                    S r   )r;   rn   r   s     r   r   zDownBlock.forward   s    t{{1~~...r   	r"   r#   r$   r&   boolr   r   r   r(   r)   s   @r   r<   r<      s        N NS N# N4 N N N N N N
/ /F / / / / / / / /r   r<   c                   @     e Zd Zdedededef fdZdedefdZ xZS )	rX   r9   r,   r:   rW   c                     t                                                       t          |||          | _        |rt	          |          nt          j                    | _        d S r   )r   r   rm   rn   Upsampler   rp   rW   )r   r9   r,   r:   rW   r   s        r   r   zUpBlock.__init__   sO    #HeV<<,4G((("+--r   r   r   c                 R    |                      |                     |                    S r   )rW   rn   r   s     r   r   zUpBlock.forward   s    }}T[[^^,,,r   rr   r)   s   @r   rX   rX      s        H HS H# H H H H H H H
- -F - - - - - - - -r   rX   nc                 V    t          j        fdt          |           D              S )Nc                 B    g | ]}t          |d k    rn          S )r   r]   )r_   )r>   r?   r,   r:   s     r   r@   z!resnet_layers.<locals>.<listcomp>   sB     

 

 

 qAvvee6&III

 

 

r   )r   r^   rF   )rx   r,   r:   s    ``r   rm   rm      sD    =

 

 

 

 

1XX

 

 

 r   c                   4     e Zd Zdef fdZdedefdZ xZS )r`   dimc                 b   t                                                       || _        t          j        d|dd          | _        t          |dd|t          j        ||          t          j        ||          t          j        ||          t          j        ||          d	  	        | _        d S )Nrb   rc   Trd   r   F)		embed_dim	num_headsnum_kv_headshead_dimq_projk_projv_projoutput_proj	is_causal)	r   r   r|   r   ri   normr   Linearattn)r   r|   r   s     r   r   zAttnLayer.__init__   s    LBSdSWXXX	&9S#&&9S#&&9S#&&	#s++

 

 

			r   r   r   c                 6   |j         \  }}}}|}|                     |          }t          j        d|          }|                    |||z  |          }|                     ||          }|                    ||||          }t          j        d|          }||z   S )Nzbchw -> bhwczbhwc -> bchw)r   r   torcheinsumreshaper   )r   r   bcrN   wresiduals          r   r   zAttnLayer.forward  s    W
1aIIaLL L++IIaQ""IIaOO IIaAq!!L++8|r   r"   r#   r$   r&   r   r   r   r(   r)   s   @r   r`   r`      sf        
C 
 
 
 
 
 
  F        r   r`   c                   8     e Zd Zdedef fdZdedefdZ xZS )r_   r,   r:   c                    t                                                       t          j        t          j        d|dd          t          j                    t          j        ||ddd          t          j        d|dd          t          j                    t          j        ||ddd          g | _        ||k    rt          j                    nt          j        ||ddd          | _	        d S )	Nrb   rc   Trd   r3   r   r4   r   )
r   r   r   r^   ri   rj   rC   mainrp   shortcut)r   r,   r:   r   s      r   r   zResnetLayer.__init__  s    MDQUVVV			%Qq!LLLTRVWWW			&&a1MMM	
	  KMMM5&a1MMM 	r   r   r   c                 X    |                      |          |                     |          z   S r   )r   r   r   s     r   r   zResnetLayer.forward(  s#    yy||dmmA....r   r   r)   s   @r   r_   r_     sm        
c 
3 
 
 
 
 
 
$/ /F / / / / / / / /r   r_   c                   4     e Zd Zdef fdZdedefdZ xZS )ro   r[   c                     t                                                       t          j        ||ddd          | _        d S )Nr3   rB   r   r4   r   r   r   rC   convr   r[   r   s     r   r   zDownsample.__init__-  9    Ib"!AqIII			r   r   r   c                 X    |                      t          j        |ddd                    S )N)r   r   r   r   constantr   )modevalue)r   Fpadr   s     r   r   zDownsample.forward1  s'    yyq,ZqIIIJJJr   r   r)   s   @r   ro   ro   ,  sw        J3 J J J J J JK KF K K K K K K K Kr   ro   c                   4     e Zd Zdef fdZdedefdZ xZS )rv   r[   c                     t                                                       t          j        ||ddd          | _        d S )Nr3   r   r4   r   r   s     r   r   zUpsample.__init__6  r   r   r   r   c                 V    |                      t          j        |dd                    S )Ng       @nearest)r0   r   )r   r   interpolater   s     r   r   zUpsample.forward:  s%    yyqsKKKLLLr   r   r)   s   @r   rv   rv   5  sw        J3 J J J J J JM MF M M M M M M M Mr   rv   r    c                     t          j        | dd          \  }}t          j        d|z            }||t          j        |          z  z   S )NrB   r   )r|   g      ?)r   chunkexp
randn_like)r    meanlogvarstds       r   rM   rM   >  sJ    ;q!+++LD&
)C&L
!
!C#(.....r   )typingr   r   r   torch.nn.functionalr   
functionalr   r   torchtune.modules.attentionr   r'   r	   r+   rS   r&   rH   rJ   r<   rX   rm   r`   r_   ro   rv   rM    r   r   <module>r      s                             : : : : : :
6 6 6 6 6bi 6 6 6r:; :; :; :; :;") :; :; :;z: : : : :") : : :z# ")    S # ")    / / / / /	 / / /- - - - -bi - - -S  c bi    ! ! ! ! !	 ! ! !H/ / / / /") / / /.K K K K K K K KM M M M Mry M M M/ /F / / / / / /r   