
    wif              &          d Z ddlmZmZmZmZmZ ddlZddlmZ ddl	m
Z
mZmZmZmZmZmZmZmZmZmZmZmZmZmZ ddgZ G d	 de          Zd
de de de de
 de dz   e_         dee         dee         dee         dee         dee         dee         dededededededededededef"d Zdee         dee         dee         dee         dee         dee         dededededededededededef"d!Z ee"          	 	 	 	 	 	 d&dee         dee         dee         dee         dee         dee         ded$ee         dedededededededededef$d%            ZdS )'z'Implementation for the NAdam algorithm.    )castListOptionalTupleUnionN)Tensor   )_capturable_doc_default_to_fused_or_foreach_differentiable_doc_disable_dynamo_if_unsupported_foreach_doc!_get_capturable_supported_devices_get_scalar_dtype
_get_value_maximize_doc_params_doc_stack_if_compiling_use_grad_for_differentiable_view_as_real	OptimizerParamsTNAdamnadamc                        e Zd Z	 	 	 	 	 	 ddddddd	ed
eeef         deeef         dedededede	e         dededef fdZ
 fdZd Zedd            Z xZS )r   Mb`?g?g+?:0yE>r   Mbp?FN)foreachmaximize
capturabledifferentiableparamslrbetasepsweight_decaymomentum_decaydecoupled_weight_decayr    r!   r"   r#   c                j   t          |t                    r'|                                dk    rt          d          d|k    st          d|           d|k    st          d|           d|d         cxk    rdk     sn t          d|d                    d|d         cxk    rdk     sn t          d	|d                    d|k    st          d
|           d|k    st          d|           t	          |||||||	||
|
  
        }t                                          ||           d S )Nr	   zTensor lr must be 1-element        zInvalid learning rate: zInvalid epsilon value: r         ?z#Invalid beta parameter at index 0: z#Invalid beta parameter at index 1: zInvalid weight_decay value: zInvalid momentum_decay value: )
r%   r&   r'   r(   r)   r*   r!   r    r"   r#   )
isinstancer   numel
ValueErrordictsuper__init__)selfr$   r%   r&   r'   r(   r)   r*   r    r!   r"   r#   defaults	__class__s                a/root/.openclaw/workspace/chatterbox_venv_py311/lib/python3.11/site-packages/torch/optim/nadam.pyr3   zNAdam.__init__    s    b&!! 	<bhhjjAoo:;;;byy;r;;<<<czz<s<<===eAh$$$$$$$$M58MMNNNeAh$$$$$$$$M58MMNNNl""JLJJKKKn$$NnNNOOO%)#9!)
 
 
 	*****    c                    t                                          |           | j        D ]}|                    dd           |                    dd            |                    dd           |                    dd           |                    dd           |d         D ],}| j                            |g           }t          |          dk    rt          j        |d	                   sjt          |d	                   }|d         r(t          j
        |t                      |j        
          n!t          j
        |t                                |d	<   t          j        |d                   s]|d         }|d         r(t          j
        |t                      |j        
          n!t          j
        |t                                |d<   .d S )Nr!   Fr    r"   r#   r*   r$   r   stepdtypedevicer<   
mu_product)r2   __setstate__param_groups
setdefaultstategetlentorch	is_tensorfloattensorr   r=   )r4   rC   grouppp_statestep_valmu_prod_valr6   s          r7   r@   zNAdam.__setstate__K   s   U###& 	 	EZ///Y---\5111-u5555u===8_  *..B//w<<1$$ ?76?;; #(#9#9
  %\2SEL (0A0C0CAH    "'h>O>Q>Q!R!R!R   !?7<+@AA &-l&;
  %\2VEL +3D3F3Fqx    "'kARATAT!U!U!U  -	 	r8   c                    d}|d         D ]}	|	j         |t          j        |	          z  }|                    |	           |	j         j        rt          d          |                    |	j                    | j        |	         }
t          |
          dk    r|d         r(t          j        dt                      |	j
                  n!t          j        dt                      	          |
d
<   |d         r(t          j        dt                      |	j
                  n!t          j        dt                      	          |
d<   t          j        |	t          j                  |
d<   t          j        |	t          j                  |
d<   |                    |
d                    |                    |
d                    |                    |
d                    |                    |
d
                    |S )NFr$   z'NAdam does not support sparse gradientsr   r"    r;   r,   r>   r:   r-   r?   )memory_formatexp_avg
exp_avg_sq)gradrF   
is_complexappend	is_sparseRuntimeErrorrC   rE   zerosr   r=   rI   ones
zeros_likepreserve_format)r4   rJ   params_with_gradgradsexp_avgsexp_avg_sqsmu_productsstate_stepshas_complexrK   rC   s              r7   _init_groupzNAdam._init_groupi   s    x $	2 $	2Av!u/222 ''***6# R&'PQQQQV$$$
1u::?? !.JB.?.A.A!(SSSS"\#5F5H5HIII &M !.J
2->-@-@RRRR"\#5F5H5HIII ,' (-'7)>( ( (E)$ +0*:)>+ + +E,' i 0111""5#6777""5#6777""5=111r8   c                    |                                   d}|5t          j                    5   |            }ddd           n# 1 swxY w Y   | j        D ]}g }g }g }g }g }g }	t	          t
          t          t          f         |d                   \  }
}|                     |||||||	          }t          ||||||	|
||d         |d         |d         |d         |d         |d         |d	         |d
         |d         |           |S )zPerform a single optimization step.

        Args:
            closure (Callable, optional): A closure that reevaluates the model
                and returns the loss.
        Nr&   r%   r(   r)   r'   r!   r*   r    r"   r#   )beta1beta2r%   r(   r)   r'   r!   r*   r    r"   r#   rc   )	 _cuda_graph_capture_health_checkrF   enable_gradrA   r   r   rH   rd   r   )r4   closurelossrJ   r]   r^   r_   r`   ra   rb   rf   rg   rc   s                r7   r:   z
NAdam.step   s    	--///"$$ ! !wyy! ! ! ! ! ! ! ! ! ! ! ! ! ! ! & &	 &	E-/"$E%'H(*K(*K(*KeUl 3U7^DDLE5**  K  ;">2$%56%Lz*',-E'Fi( .$%56'%    * s   AA
A)r   r   r   r   r   FN)__name__
__module____qualname__r   r   rH   r   r   boolr   r3   r@   rd   r   r:   __classcell__)r6   s   @r7   r   r      s;        $(%1 $',)+ #' $)+ )+ )+)+ %- )+ UE\"	)+
 )+ )+ )+ !%)+ $)+ )+ )+ )+ )+ )+ )+ )+ )+V    <0 0 0d "6 6 6 "!6 6 6 6 6r8   a  Implements NAdam algorithm.

    .. math::
       \begin{aligned}
            &\rule{110mm}{0.4pt}                                                                 \\
            &\textbf{input}      : \gamma_t \text{ (lr)}, \: \beta_1,\beta_2 \text{ (betas)},
                \: \theta_0 \text{ (params)}, \: f(\theta) \text{ (objective)}                   \\
            &\hspace{13mm} \: \lambda \text{ (weight decay)}, \:\psi \text{ (momentum decay)}    \\
            &\hspace{13mm} \: \textit{decoupled\_weight\_decay}, \:\textit{maximize}             \\
            &\textbf{initialize} :  m_0 \leftarrow 0 \text{ ( first moment)},
                v_0 \leftarrow 0 \text{ ( second moment)}                                 \\[-1.ex]
            &\rule{110mm}{0.4pt}                                                                 \\
            &\textbf{for} \: t=1 \: \textbf{to} \: \ldots \: \textbf{do}                         \\
            &\hspace{5mm}\textbf{if} \: \textit{maximize}:                                       \\
            &\hspace{10mm}g_t           \leftarrow   -\nabla_{\theta} f_t (\theta_{t-1})         \\
            &\hspace{5mm}\textbf{else}                                                           \\
            &\hspace{10mm}g_t           \leftarrow   \nabla_{\theta} f_t (\theta_{t-1})          \\
            &\hspace{5mm} \theta_t \leftarrow \theta_{t-1}                                       \\
            &\hspace{5mm} \textbf{if} \: \lambda \neq 0                                          \\
            &\hspace{10mm}\textbf{if} \: \textit{decoupled\_weight\_decay}                       \\
            &\hspace{15mm} \theta_t \leftarrow \theta_{t-1} - \gamma \lambda \theta_{t-1}                    \\
            &\hspace{10mm}\textbf{else}                                                          \\
            &\hspace{15mm} g_t \leftarrow g_t + \lambda \theta_{t-1}                             \\
            &\hspace{5mm} \mu_t \leftarrow \beta_1 \big(1 - \frac{1}{2}  0.96^{t \psi} \big)     \\
            &\hspace{5mm} \mu_{t+1} \leftarrow \beta_1 \big(1 - \frac{1}{2} 0.96^{(t+1)\psi}\big)\\
            &\hspace{5mm}m_t           \leftarrow   \beta_1 m_{t-1} + (1 - \beta_1) g_t          \\
            &\hspace{5mm}v_t           \leftarrow   \beta_2 v_{t-1} + (1-\beta_2) g^2_t          \\
            &\hspace{5mm}\widehat{m_t} \leftarrow \mu_{t+1} m_t/(1-\prod_{i=1}^{t+1}\mu_i)\\[-1.ex]
            & \hspace{11mm} + (1-\mu_t) g_t /(1-\prod_{i=1}^{t} \mu_{i})                         \\
            &\hspace{5mm}\widehat{v_t} \leftarrow   v_t/\big(1-\beta_2^t \big)                   \\
            &\hspace{5mm}\theta_t \leftarrow \theta_t - \gamma \widehat{m_t}/
                \big(\sqrt{\widehat{v_t}} + \epsilon \big)                                       \\
            &\rule{110mm}{0.4pt}                                                          \\[-1.ex]
            &\bf{return} \:  \theta_t                                                     \\[-1.ex]
            &\rule{110mm}{0.4pt}                                                          \\[-1.ex]
       \end{aligned}

    For further details regarding the algorithm we refer to `Incorporating Nesterov Momentum into Adam`_.
    z
    Args:
        a  
        lr (float, Tensor, optional): learning rate (default: 2e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        momentum_decay (float, optional): momentum momentum_decay (default: 4e-3)
        decoupled_weight_decay (bool, optional): whether to use decoupled weight
            decay as in AdamW to obtain NAdamW (default: False)
        z	
        z

    .. _Incorporating Nesterov Momentum into Adam:
        https://openreview.net/forum?id=OM0jvwB8jIp57ZJjtNEZ
    .. _Decoupled Weight Decay Regularization:
        https://arxiv.org/abs/1711.05101

    r$   r^   r_   r`   ra   rb   rf   rg   r%   r(   r)   r'   r*   r!   r"   r#   rc   c          
      
   t          |           D ]\  }}|s||         n||          }||         }||         }||         }||         }t          j        |          rPt          j        |          }t          j        |          }t          j        |          }t          j        |          }t          j                                        sZ|rXt                      }|j        j        |j        j        cxk    r|j        j        k    rn n|j        j        |v sJ d| d            |dz  }|r|}nt          |          }d||z  z
  }|	dk    r5|r|
                    d||	z  z
             n|                    ||	          }|ddd||
z  z  z  z
  z  }|ddd|dz   |
z  z  z  z
  z  }||z  }|                    |d|z
             |
                    |                              ||d|z
  	           |                    |                                          }|s|ri|                    |          }||z  }|| d|z
  z  d|z
  z  z  }|| |z  d|z
  z  z  }|                    ||           |                    ||           vt          |          |z  }|                    |           |                    ||| d|z
  z  dt          |          z
  z  	           |                    ||| |z  d|z
  z  	           d S )
NzVIf capturable=True, params, mu_products and state_steps must be on supported devices: .r	   r   alphar-         ?Q?)value)	enumeraterF   rU   view_as_realcompileris_compilingr   r=   typer   mul_addlerp_addcmul_divsqrtaddcdiv_add_)r$   r^   r_   r`   ra   rb   rf   rg   r%   r(   r)   r'   r*   r!   r"   r#   rc   iparamrT   rR   rS   r?   step_tcapturable_supported_devicesr:   bias_correction2mumu_nextdenommu_product_nexts                                  r7   _single_tensor_nadamr     s   ( f%% G G5'6uQxxeAhY1+ ^
 ^
QE"" 	8&u--E%d++D(11G+J77J ~**,, 	 	+L+N+N(!Z%6%;QQQQv}?QQQQQQL%)EEEEI)EI I I FEF 	! 	&DDf%%Dud{?1% ;

1rL001111xx\x:: cC4D>,A#BCCD3$(n1L(M!NNO 	b
 	dAI&&&''d!e)'DDD/005577 	Z 	IIcNNE )72OB3#(+sZ/?@AD"w#2G!HIGNN4'''NN7E****(44w>OJJsOOONNeRC38$4j>T>T8T$U     NNsW}9N&O     KG Gr8   c                  
( t          |           dk    rd S |r
J d            t          j                                        sJ|rHt	          d          (t          (fdt          | ||          D                       sJ d( d            t          j        | |||||g          }|	                                D ]1\  \  }}}}}}}t          t          t                   |          }t          t          t                   |          }t          t          t                   |          }t          t          t                   |          }t          t          t                   |          }t          t          t                   |          }|rt          ||||           |rt          j        |          }t          j                                        s9|d         j        r,t          j        |t          j        dd	
          d           nt          j        |d           |	dk    rO|rt          j        |d|	z  z
             n1|rt          j        |||	           nt          j        |||	          }t          j        ||dz
             t          j        |           t          j        |||dz
             t          j        |          }|r&t          j        |
          } t          j        d|           }!t          j        |!d           t          j        |!d           t          j        |!           t          j        | 
           t          j        d|           }"t          j        |"d           t          j        |"d           t          j        |"           ~ t          j        |          }#t          j        |#d           t          j        |#           t          j        |#           n,fd|D             }#
fd|D             }!
fd|D             }"t          j        ||!           t          j        ||#           t          j        ||           ~#|rt          j        |!d           t          j        |!           t          j        |d          }$t          j        |$           t          j        |!|$           |!}%~$t          j        ||"          }$t          j        |"           t          j        |$d           t          j        |"|$           |"}&~$t          j        |%|          }'t          j        |'|&|           t          j        ||'|           t?          fdt          ||!          D                       }%t?          fdt          ||"          D                       }&t          j        ||||%           t          j        ||||&           3d S )Nr   z#_foreach ops don't support autogradF)supports_xlac              3      K   | ]D\  }}}|j         j        |j         j        cxk    o|j         j        k    nc o|j         j        v V  Ed S rl   )r=   r}   ).0rK   mpr:   r   s       r7   	<genexpr>z&_multi_tensor_nadam.<locals>.<genexpr>  sz       
 
 2t HMRY^????t{/????? >!==
 
 
 
 
 
r8   zWIf capturable=True, params, mu_products, and state_steps must be on supported devices: rs   r-   cpu)r=   rt   r	   rw   g      c                 @    g | ]}d t          |          z  z
  dz  S )r	   rv   r   )r   r:   rg   s     r7   
<listcomp>z'_multi_tensor_nadam.<locals>.<listcomp>  s=     $ $ $;?Uj....36$ $ $r8   c           	      L    g | ] }d ddt          |          z  z  z  z
  z  !S )r-   rv   rw   r   r   r:   rf   r)   s     r7   r   z'_multi_tensor_nadam.<locals>.<listcomp>  sJ        sdz$/?/?./P&QRRS  r8   c           	      R    g | ]#}d ddt          |          dz   z  z  z  z
  z  $S )r-   rv   rw   r	   r   r   s     r7   r   z'_multi_tensor_nadam.<locals>.<listcomp>  sU         *T*:*:Q*>.)P QRRT  r8   c                 l    g | ]0\  }}t                    d |z
  z  d t          |          z
  z  dz  1S r-   r   )r   r?   r   r%   s      r7   r   z'_multi_tensor_nadam.<locals>.<listcomp>+  sT       &
B  ^^sRx0C*Z:P:P4PQUWW  r8   c                 l    g | ]0\  }}t                    |z  d t          |          |z  z
  z  dz  1S r   r   )r   r?   r   r%   s      r7   r   z'_multi_tensor_nadam.<locals>.<listcomp>1  sb        ,
G #2!"J!7!7'!AAC   r8   ) rE   rF   r{   r|   r   allzipr   "_group_tensors_by_device_and_dtypevaluesr   r   r   r   _foreach_negis_cpu_foreach_add_rI   _foreach_mul__foreach_add_foreach_lerp__foreach_addcmul__foreach_sqrt_foreach_mul_foreach_pow_foreach_sub__foreach_neg__foreach_sqrt__foreach_div__foreach_sub_foreach_addcdiv_r   ))r$   r^   r_   r`   ra   rb   rf   rg   r%   r(   r)   r'   r*   r!   r"   r#   rc   grouped_tensorsgrouped_params_grouped_grads_grouped_exp_avgs_grouped_exp_avg_sqs_grouped_mu_products_grouped_state_steps__grouped_paramsgrouped_gradsgrouped_exp_avgsgrouped_exp_avg_sqsgrouped_mu_productsgrouped_state_stepsexp_avg_sq_sqrtexponentmusmu_nextsbias_correction_sqrtr   step_size_gradsstep_size_expavg	numeratorr   s)         ``` `                             @r7   _multi_tensor_nadamr   u  s)   ( 6{{aDDDDD >&&(( EZ E'H(
 (
 (
$  
 
 
 
  #6;DD
 
 
 
 
 	E 	E E  fB  E  E  E		E 	E 
  B	+{KH O ""$$e e 		 	d6lO<<T&\>::V.?@@"4<1EFF"4<1EFF"4<1EFF  	/?AT    	>!.}==M ~**,, 	81DQ1G1N 	8#U\#e%D%D%DC      3Q7771% #NA\8I4IJJJJ  '%~\     %*$6%~\% % %M
 	-}a%iHHH/777q5y	
 	
 	
  -.ABB
  #	)*=~NNH$T844CT***S)))U+++ .999)$99H$///#...%000 #(#5e=P#Q#Q  4c::: 4555 !56666$ $ $ $CV$ $ $     /  C     0  H 	/555O-ABBBOS111 ! 8	S)))R(((&':C@@E&&&U+++!O &':HEEE"--- s+++%000' *?MJJI#I/?AQRRR #NIOOOO1   *-.A3*G*G   O  3    033F/Q/Q  
  
  #   # 0/CS   Ge er8   )single_tensor_fnFr    c                   t          d |D                       st          d          t          d |D                       st          d          |t          | |	d          \  }}|r-t          j                                        rt          d          |r&t          j                                        st          }nt          } || |||||||||||||||	|
	           dS )
zpFunctional API that performs NAdam algorithm computation.

    See :class:`~torch.optim.NAdam` for details.
    c              3   J   K   | ]}t          |t          j                  V  d S rl   r.   rF   r   r   ts     r7   r   znadam.<locals>.<genexpr>`  .      @@qz!U\**@@@@@@r8   zPAPI has changed, `state_steps` argument must contain a list of singleton tensorsc              3   J   K   | ]}t          |t          j                  V  d S rl   r   r   s     r7   r   znadam.<locals>.<genexpr>e  r   r8   zPAPI has changed, `mu_products` argument must contain a list of singleton tensorsNF)	use_fusedz6torch.jit.script not supported with foreach optimizers)rf   rg   r%   r(   r)   r!   r*   r'   r"   r#   rc   )r   rX   r   rF   jitis_scriptingr   r   )r$   r^   r_   r`   ra   rb   r*   r    r"   r#   rc   r!   rf   rg   r%   r(   r)   r'   r   funcs                       r7   r   r   D  sC   8 @@K@@@@@ 
^
 
 	
 @@K@@@@@ 
^
 
 	
 1Ne
 
 

7  U59))++ USTTT $uy--// $"#D!%5%#     r8   )FNFFFF) __doc__typingr   r   r   r   r   rF   r   	optimizerr
   r   r   r   r   r   r   r   r   r   r   r   r   r   r   __all__r   rH   rp   r   r   r   rP   r8   r7   <module>r      s   / . 5 5 5 5 5 5 5 5 5 5 5 5 5 5                                        & G
s s s s sI s s sn&N	  
  
  
  
  O= D[L[<[ 6l[ f	[
 f[ f[ [ [ 	[ [ [ 
[ ![ [  ![" #[$ %[ [ [ [|LLL<L 6lL f	L
 fL fL L L 	L L L 
L !L L  !L" #L$ %L L L L^  1EFFF $)" D DLD<D 6lD f	D
 fD fD !D d^D D D D D  !D" #D$ 	%D& 'D( )D* 
+D D D GFD D Dr8   