
    &`i+                     6   d Z ddlZddlmZ ddlmZ ddlZddlmZ ddlm	Z
 ddlmZ ddlmZ ee G d d	                                  Z ed
           G d dej                              Z ed
           G d dej                              Z ed
           G d dej                              Z ed
          	 ddej        dededeeef         dej        j        f
d            Z ed
           G d dej                              ZdS )a  
Adapted from https://github.com/karpathy/minGPT

Full definition of a GPT Language Model, all of it in this single file.
References:
1) the official GPT-2 TensorFlow implementation released by OpenAI:
https://github.com/openai/gpt-2/blob/master/src/model.py
2) huggingface/transformers PyTorch implementation:
https://github.com/huggingface/transformers/blob/main/src/transformers
        /models/gpt2/modeling_gpt2.py
    N)	dataclass)Tuple)
functional)
Deprecated)DeveloperAPIc                   n    e Zd ZU eed<   dZeed<   dZeed<   dZeed<   dZe	ed<   dZ
e	ed	<   dZe	ed
<   dS )	GPTConfig
block_size   n_layern_headi   n_embedg?embed_pdropresid_pdrop
attn_pdropN)__name__
__module____qualname__int__annotations__r   r   r   r   floatr   r        q/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/ray/rllib/models/torch/mingpt.pyr	   r	      s          OOO GSFCGS KKJr   r	   F)errorc                       e Zd ZdZd ZdS )NewGELUz
    Implementation of the GELU activation function currently in Google BERT
    repo (identical to OpenAI GPT).
    Reference: Gaussian Error Linear Units (GELU) paper:
    https://arxiv.org/abs/1606.08415
    c                     d|z  dt          j        t          j        dt          j        z            |dt          j        |d          z  z   z            z   z  S )Ng      ?      ?g       @gHm?g      @)torchtanhmathsqrtpipow)selfxs     r   forwardzNewGELU.forward4   s^     *IcDGm,,HuyC?P?P4P0PQ 		
r   N)r   r   r   __doc__r(   r   r   r   r   r   +   s-         

 

 

 

 

r   r   c                   0     e Zd ZdZdef fdZddZ xZS )CausalSelfAttentionz
    Vanilla multi-head masked self-attention layer with a projection at the end.
    It is possible to use torch.nn.MultiheadAttention here but I am including an
    explicit implementation here to show that there is nothing too scary here.
    configc           	      z   t                                                       |j        |j        z  dk    sJ t	          j        |j        d|j        z            | _        t	          j        |j        |j                  | _        t	          j        |j	                  | _
        t	          j        |j                  | _        |                     dt          j        t          j        |j        |j                                                dd|j        |j                             |j        | _        |j        | _        d S )Nr      bias   )super__init__r   r   nnLinearc_attnc_projDropoutr   attn_dropoutr   resid_dropoutregister_bufferr    trilonesr
   viewr&   r,   	__class__s     r   r2   zCausalSelfAttention.__init__I   s   ~-2222iFN0BCCi??Jv'899Z(:;; 	Juz&"3V5FGGHHMM1f'): 	
 	
 	
 m~r   Nc                 2   |                                 \  }}}|                     |                              | j        d          \  }}}|                    ||| j        || j        z                                dd          }|                    ||| j        || j        z                                dd          }|                    ||| j        || j        z                                dd          }||                    dd          z  dt          j        |                     d                    z  z  }	|		                    | j
        d d d d d |d |f         dk    t          d                    }	||	|z   }	t          j        |	d          }	|                     |	          }	|	|z  }
|
                    dd                                                              |||          }
|                     |                     |
                    }
|
|	fS )	N   )dimr0   r   r   z-inf)sizer5   splitr   r=   r   	transposer"   r#   masked_fillr/   r   Fsoftmaxr8   
contiguousr9   r6   )r&   r'   attention_masksBTCqkvattys              r   r(   zCausalSelfAttention.forward^   s   &&((1a ++a..&&t|&;;1aFF1aa4;&677AA!QGGFF1aa4;&677AA!QGGFF1aa4;&677AA!QGG 1;;r2&&&3166"::1F1F+FGoodi111bqb"1"5:E&MMJJ&'Ci$$$$$!GKK1((**//1a88 t{{1~~..#vr   Nr   r   r   r)   r	   r2   r(   __classcell__r?   s   @r   r+   r+   A   sb         &y & & & & & &*       r   r+   c                   0     e Zd ZdZdef fdZddZ xZS )Blockzan unassuming Transformer blockr,   c           
         t                                                       t          j        |j                  | _        t          |          | _        t          j        |j                  | _        t          j	        t          t          j        |j        d|j        z            t          j        d|j        z  |j                  t                      t          j        |j                                      | _        d S )N   )c_fcr6   actdropout)r1   r2   r3   	LayerNormr   ln_1r+   attnln_2
ModuleDictdictr4   r   r7   r   mlpr>   s     r   r2   zBlock.__init__   s    L00	'//	L00	=Yv~q6>/ABByV^!3V^DDII
6#566	  
 
r   Nc           	      <   |                      |                     |          |          \  }}||z   }| j                            | j                            | j                            | j                            |                                        }||z   }||fS )NrL   )rb   ra   rf   r_   r6   r^   r]   )r&   r'   rL   x_attrS   x_ffns         r   r(   zBlock.forward   s    YYtyy||_YMM
sI   dhmmA>N>N1O1O!P!PQQI#vr   rU   rV   rX   s   @r   rZ   rZ   }   s^        ))
y 
 
 
 
 
 

 
 
 
 
 
 
 
r   rZ   g?gffffff?modellearning_rateweight_decaybetasreturnc                 t   t                      }t                      }t          j        j        f}t          j        j        t          j        j        f}|                                 D ]\  }	}
|
                                D ]\  }}|	r|	d|n|}|                    d          r|	                    |           ;|                    d          r&t          |
|          r|	                    |           v|                    d          r%t          |
|          r|	                    |           t          |                                           ||z  }||z  }t          |          dk    sJ dt          |           d            t                                          |z
            dk    s0J dt                                          |z
             d            fdt          |          D             |d	fd
t          |          D             dd	g}t          j        j        |f||d|}|S )ai  
    This long function is unfortunately doing something very simple and is
    being very defensive: We are separating out all parameters of the model
    into two buckets: those that will experience weight decay for regularization
    and those that won't (biases, and layernorm/embedding weights). We are then
    returning the PyTorch optimizer object.
    .r/   weightr   zparameters z' made it into both decay/no_decay sets!z3 were not separated into either decay/no_decay set!c                      g | ]
}|         S r   r   .0pn
param_dicts     r   
<listcomp>z+configure_gpt_optimizer.<locals>.<listcomp>   s    >>>"z"~>>>r   )paramsrn   c                      g | ]
}|         S r   r   ru   s     r   ry   z+configure_gpt_optimizer.<locals>.<listcomp>   s    AAA"z"~AAAr           )lrro   )setr    r3   r4   r`   	Embeddingnamed_modulesnamed_parametersendswithadd
isinstancere   lenstrkeyssortedoptimAdamW)rl   rm   rn   ro   kwargsdecayno_decaywhitelist_w_modulesblacklist_w_modulesmnmrw   pfpninter_paramsunion_paramsoptim_groups	optimizerrx   s                     @r   configure_gpt_optimizerr      s   $ EEEuuH 8?, 8-ux/AB$$&& " "A'')) 	" 	"EB(*2RRR$$C
 {{6"" "S!!!!X&& ":a9L+M+M "		#X&& ":a9L+M+M "S!!!	"" e,,..//J8#L8#LLQOS&&OOO 	z  </00A555	5c*//++l:;; 	5 	5 	5 655 ?>>>u>>>(	
 	

 BAAAx0@0@AAA	
 	
	L !,X=XXQWXXIr   c                   6     e Zd ZdZdef fdZd ZddZ xZS )	GPTzGPT Transformer Modelr,   c                    t                                                       j        J j        | _        t          j        t          t          j        j                  t          j        fdt          j
                  D                       t          j        j                                      | _        |                     | j                   |                                 D ]^\  }}|                    d          rDt$          j        j                            |ddt+          j        dj
        z            z             _d S )Nc                 .    g | ]}t                    S r   )rZ   )rv   _r,   s     r   ry   z GPT.__init__.<locals>.<listcomp>   s     N N N1v N N Nr   )drophln_fzc_proj.weightr|   {Gz?rA   meanstd)r1   r2   r
   r3   rd   re   r7   r   
ModuleListranger   r`   r   transformerapply_init_weightsr   r   r    initnormal_r"   r#   )r&   r,   rw   r   r?   s    `  r   r2   zGPT.__init__   s7    ,,, +=Z 233- N N N Nfn8M8M N N NOO\&.11  
 
 	

4%&&&**,, 	 	EB{{?++ %%CTDIa&.6H,I,I%I &   	 	r   c                 f   t          |t          j                  r`t          j        j                            |j        dd           |j        +t          j        j                            |j                   d S d S t          |t          j	                  r.t          j        j                            |j        dd           d S t          |t          j
                  rTt          j        j                            |j                   t          j        j                            |j                   d S d S )Nr|   r   r   )r   r3   r4   r    r   r   rs   r/   zeros_r   r`   ones_)r&   modules     r   r   zGPT._init_weights   s    fbi(( 	/HM!!&-ct!DDD{&$$V[11111 '&-- 	/HM!!&-ct!DDDDD-- 	/HM  ---HM.....	/ 	/r   NFc                 
   |                                 \  }}}|| j        k    sJ d| d| j                     |X|                                 \  }}||k    r||k    sJ |ddddddf         }|                    |j                  }d|z
  dz  }| j                            |          }	g }
| j        j        D ]'} ||	|          \  }	}|
                    |           (| j                            |	          }	|r|	|
fS |	S )z
        input_embeds: [batch_size x seq_len x n_embed]
        attention_masks: [batch_size x seq_len], 0 don't attend, 1 attend
        z"Cannot forward sequence of length z, block size is only N)dtyper   g    erh   )	rE   r
   tor   r   r   r   appendr   )r&   input_embedsrL   return_attentionsrM   rN   rO   _B_Tr'   attsblockrS   s                r   r(   zGPT.forward  s[   
 ##%%1aDO###4 4 4"&/4 4 $##
 &$))++FB77rQwww& .aaatQQQ.>?O .00|7I0JJO"_4<O !!,//%' 	 	EU1o>>>FAsKK!!!$$ 	d7NHr   )NF)	r   r   r   r)   r	   r2   r   r(   rW   rX   s   @r   r   r      sm        y      ,	/ 	/ 	/, , , , , , , ,r   r   )rk   )r)   r"   dataclassesr   typingr   r    torch.nnr3   r   rI   ray._common.deprecationr   ray.rllib.utils.annotationsr   r	   Moduler   r+   rZ   r   r   	Optimizerr   r   r   r   r   <module>r      sq  
 
  ! ! ! ! ! !              $ $ $ $ $ $ . . . . . . 4 4 4 4 4 4 
         %
 
 
 
 
bi 
 
 
* %8 8 8 8 8") 8 8 8v %    BI   < %
 "-	> >9>> > 	> [> > > >B %P P P P P") P P P P Pr   