
    wiO                     8   d dl Z d dlZd dlmZ d dlZd dlmZ dde	fdZ
ej        j        j        ddfde	de	fdZ G d d	ej                  Z G d
 dej                  Z G d dej                  Z G d dej                  Z G d dej                  ZdS )    N   key_chunk_sizec                     j         dd         \  }j         d         t          |           t          j                  z   t	          j        t          j        d          fd             fd}t          j        	                    |t          j
        d|          	          \  }}}	t          j        |	dd
          }
t          j        |	|
z
            }|t          j        |d          z  }||z  }|                    d          }t          j        |d                              d          }||z  S )zBMulti-head dot product attention with a limited number of queries.NF)prevent_csec                 Z   t          j        d| |          }t          j        |dd          }t          j                            |          }t          j        ||z
            }t          j        d||          }t          j        d|          }||                    d          |fS )	Nz...qhd,...khd->...qhk)	precisionr   Taxiskeepdimsz...vhf,...qhv->...qhfz...qhk->...qhr   )jnpeinsummaxjaxlaxstop_gradientexpsum)querykeyvalueattn_weights	max_scoreexp_weights
exp_valuesr
   s          o/root/.openclaw/workspace/chatterbox_venv_py311/lib/python3.11/site-packages/diffusers/models/attention_flax.pysummarize_chunkz/_query_chunk_attention.<locals>.summarize_chunk   s    z"95#QZ[[[GLrDAAA	G)))44	glY677Z 7W`aaa
J	::	KOOO44i@@    c           	      n   t           j                            dgj        dz
  z  | ddgz   t	          j        d d                   gz             }t           j                            
dg
j        dz
  z  | ddgz   t	          
j        d d                   	gz             } ||          S )Nr      r   operandstart_indicesslice_sizes)r   r   dynamic_slicendimlistshape)	chunk_idx	key_chunkvalue_chunk
k_featuresr   r   	num_headsr   r   
v_featuresr   s      r   chunk_scannerz-_query_chunk_attention.<locals>.chunk_scanner+   s    G))#A.)Q1BBSYss^,,	:/VV * 
 
	 g++#a0Iq!3DDU["-...)Z1XX , 
 
 ui===r    r   )fxsTr   r   )r*   minr   sqrt	functoolspartialr   
checkpointr   maparanger   r   expand_dimsr   )r   r   r   r
   r   num_kvr1   chunk_valueschunk_weights	chunk_max
global_max	max_diffs
all_valuesall_weightsr.   r/   r   r0   s   `````         @@@@r   _query_chunk_attentionrD      s   $'IbccN!FIzRJ00NCHZ(((Es~5999
A 
A 
A 
A :9
A> > > > > > > > > > > >" .1W[[=SZXY[acqMrMr[-s-s*L-T:::J	J.//ICOIB7777LYM!!q!))J/-4488a8@@K##r    i   query_chunk_sizec           	          	
  j         dd         \  
		
 f	d}t          j                            |ddt	          j        
z                      \  }}t          j        |d          S )a  
    Flax Memory-efficient multi-head dot product attention. https://arxiv.org/abs/2112.05682v2
    https://github.com/AminRezaei0x443/memory-efficient-attention

    Args:
        query (`jnp.ndarray`): (batch..., query_length, head, query_key_depth_per_head)
        key (`jnp.ndarray`): (batch..., key_value_length, head, query_key_depth_per_head)
        value (`jnp.ndarray`): (batch..., key_value_length, head, value_depth_per_head)
        precision (`jax.lax.Precision`, *optional*, defaults to `jax.lax.Precision.HIGHEST`):
            numerical precision for computation
        query_chunk_size (`int`, *optional*, defaults to 1024):
            chunk size to divide query array value must divide query_length equally without remainder
        key_chunk_size (`int`, *optional*, defaults to 4096):
            chunk size to divide key and value array value must divide key_value_length equally without remainder

    Returns:
        (`jnp.ndarray`) with shape of (batch..., query_length, head, value_depth_per_head)
    r   Nc           	         	 t           j                            	dg	j        dz
  z  | ddgz   t	          	j        d d                   t          
          gz             }| 
z   t          |          fS )Nr   r"   r   r#   )r   r   r   r
   r   )r   r   r'   r(   r)   r*   r4   rD   )r+   _query_chunkr   r   r/   num_qr
   
q_featuresr   rE   r   s      r   r1   z5jax_memory_efficient_attention.<locals>.chunk_scannera   s    g++3%*q.1iA5FFU["-..#6F2N2NPY[e1ff , 
 
 (("!s%9]k  
 	
r    r   )r2   initr3   lengthr   )r*   r   r   scanmathceilr   concatenate)r   r   r   r
   rE   r   r1   rH   resr/   rJ   rK   s   ``````   @@@r   jax_memory_efficient_attentionrS   J   s    * $);rss#3 E9j
 
 
 
 
 
 
 
 
 
 
 
 
 W\\
y!1122	   FAs ?3R((((r    c                       e Zd ZU dZeed<   dZeed<   dZeed<   dZe	ed<   d	Z
eed
<   d	Zeed<   ej        Zej        ed<   d Zd Zd ZddZdS )FlaxAttentiona   
    A Flax multi-head attention module as described in: https://arxiv.org/abs/1706.03762

    Parameters:
        query_dim (:obj:`int`):
            Input hidden states dimension
        heads (:obj:`int`, *optional*, defaults to 8):
            Number of heads
        dim_head (:obj:`int`, *optional*, defaults to 64):
            Hidden states dimension inside each head
        dropout (:obj:`float`, *optional*, defaults to 0.0):
            Dropout rate
        use_memory_efficient_attention (`bool`, *optional*, defaults to `False`):
            enable memory efficient attention https://arxiv.org/abs/2112.05682
        split_head_dim (`bool`, *optional*, defaults to `False`):
            Whether to split the head dimension into a new axis for the self-attention computation. In most cases,
            enabling this flag should speed up the computation for Stable Diffusion 2.x and Stable Diffusion XL.
        dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
            Parameters `dtype`

    	query_dim   heads@   dim_head        dropoutFuse_memory_efficient_attentionsplit_head_dimdtypec                    | j         | j        z  }| j         dz  | _        t          j        |d| j        d          | _        t          j        |d| j        d          | _        t          j        |d| j        d          | _        t          j        | j	        | j        d          | _
        t          j        | j        	          | _        d S )
Ng      Fto_q)use_biasr_   nameto_kto_vto_out_0)r_   rc   rate)rZ   rX   scalennDenser_   r   r   r   rV   	proj_attnDropoutr\   dropout_layerself	inner_dims     r   setupzFlaxAttention.setup   s    MDJ.	]D(
 Xi%tzPVWWW
8ITZfUUUXi%tzPVWWW
$.
TTTZT\:::r    c                     |j         \  }}}| j        }|                    |||||z            }t          j        |d          }|                    ||z  |||z            }|S N)r         r"   r*   rX   reshaper   	transposerp   tensor
batch_sizeseq_lendim	head_sizes         r   reshape_heads_to_batch_dimz(FlaxAttention.reshape_heads_to_batch_dim   sh    #)< 
GSJ	
GYy@PQQv|44
Y 6	AQRRr    c                     |j         \  }}}| j        }|                    ||z  |||          }t          j        |d          }|                    ||z  |||z            }|S rt   rw   rz   s         r   reshape_batch_dim_to_headsz(FlaxAttention.reshape_batch_dim_to_heads   sg    #)< 
GSJ	
i 7GSQQv|44
i 7#	/RRr    NTc                    ||n|}|                      |          }|                     |          }|                     |          }| j        rw|j        d         }t          j        ||d| j        | j        f          }t          j        ||d| j        | j        f          }	t          j        ||d| j        | j        f          }
n?| 	                    |          }| 	                    |          }	| 	                    |          }
| j
        r|                    ddd          }|	                    ddd          }	|
                    ddd          }
|j        d         }|dz  dk    rt          |dz            }nG|dz  dk    rt          |dz            }n+|dz  dk    rt          |dz            }nt          |          }t          ||	|
|d	
          }|                    ddd          }n| j        rt          j        d|	|          }nt          j        d||	          }|| j        z  }t!          j        || j        rdnd          }| j        rIt          j        d||
          }|j        d         }t          j        ||d| j        | j        z  f          }n+t          j        d||
          }|                     |          }|                     |          }|                     ||          S )Nr   r   rv   ru   r   rY         i @  )rE   r   zb t n h, b f n h -> b n f tzb i d, b j d->b i jr   zb n f t, b t n h -> b f n hzb i j, b j d -> b i ddeterministic)r   r   r   r^   r*   r   rx   rX   rZ   r   r]   ry   intrS   r   ri   rj   softmaxr   rl   rn   )rp   hidden_statescontextr   
query_projkey_proj
value_projbquery_states
key_statesvalue_statesflatten_latent_dimrE   attention_scoresattention_probss                  r   __call__zFlaxAttention.__call__   s$   #*?--ZZ..
88G$$ZZ((
 	G#A&A;zAr4:t}3UVVLX2tz4=/QRRJ;zAr4:t}3UVVLL:::FFL88BBJ:::FFL. (	O'11!Q::L#--aA66J'11!Q::L
 ".!3B!7!B&!++#&'9B'>#?#?  #b(A--#&'9B'>#?#?  #a'1,,#&'9A'=#>#>  #&'9#:#: :j,IYjr  M *33Aq!<<MM " _#&:.KZYe#f#f  #&:.C\S]#^#^ /$*< j)9dFY@`_`aaaO " O #
+H/[g h h!'* #MAr4:PTP]C];^ _ _ #
+BOUa b b $ ? ? N N}55!!-}!MMMr    )NT)__name__
__module____qualname____doc__r   __annotations__rX   rZ   r\   floatr]   boolr^   r   float32r_   rr   r   r   r    r    r   rU   rU   z   s          , NNNE3NNNHcGU+0"D000 ND   {E39"""
; 
; 
;    <N <N <N <N <N <Nr    rU   c                       e Zd ZU dZeed<   eed<   eed<   dZeed<   dZe	ed<   e
j        Ze
j        ed	<   dZe	ed
<   dZe	ed<   d ZddZdS )FlaxBasicTransformerBlockae  
    A Flax transformer block layer with `GLU` (Gated Linear Unit) activation function as described in:
    https://arxiv.org/abs/1706.03762


    Parameters:
        dim (:obj:`int`):
            Inner hidden states dimension
        n_heads (:obj:`int`):
            Number of heads
        d_head (:obj:`int`):
            Hidden states dimension inside each head
        dropout (:obj:`float`, *optional*, defaults to 0.0):
            Dropout rate
        only_cross_attention (`bool`, defaults to `False`):
            Whether to only apply cross attention.
        dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
            Parameters `dtype`
        use_memory_efficient_attention (`bool`, *optional*, defaults to `False`):
            enable memory efficient attention https://arxiv.org/abs/2112.05682
        split_head_dim (`bool`, *optional*, defaults to `False`):
            Whether to split the head dimension into a new axis for the self-attention computation. In most cases,
            enabling this flag should speed up the computation for Stable Diffusion 2.x and Stable Diffusion XL.
    r~   n_headsd_headr[   r\   Fonly_cross_attentionr_   r]   r^   c           	      H   t          | j        | j        | j        | j        | j        | j        | j                  | _        t          | j        | j        | j        | j        | j        | j        | j                  | _	        t          | j        | j        | j                  | _        t          j        d| j                  | _        t          j        d| j                  | _        t          j        d| j                  | _        t          j        | j                  | _        d S )Nr_   )r~   r\   r_   h㈵>)epsilonr_   rg   )rU   r~   r   r   r\   r]   r^   r_   attn1attn2FlaxFeedForwardffrj   	LayerNormnorm1norm2norm3rm   rn   rp   s    r   rr   zFlaxBasicTransformerBlock.setup  s    "HLKL/*
 
 

 #HLKL/*
 
 

 "dhDJWWW\$djAAA
\$djAAA
\$djAAA
ZT\:::r    Tc                    |}| j         r,|                     |                     |          ||          }n*|                     |                     |          |          }||z   }|}|                     |                     |          ||          }||z   }|}|                     |                     |          |          }||z   }|                     ||          S Nr   )r   r   r   r   r   r   r   rn   )rp   r   r   r   residuals        r   r   z"FlaxBasicTransformerBlock.__call__2  s     $ 	_ JJtzz-'@'@'YfJggMM JJtzz-'@'@P]J^^M%0 !

4::m#<#<gUb
cc%0 !

= 9 9WW%0!!-}!MMMr    NT)r   r   r   r   r   r   r\   r   r   r   r   r   r_   r]   r^   rr   r   r   r    r   r   r      s          2 
HHHLLLKKKGU!&$&&&{E39"""+0"D000 ND   ; ; ;6N N N N N Nr    r   c                       e Zd ZU dZeed<   eed<   eed<   dZeed<   dZeed<   d	Z	e
ed
<   d	Ze
ed<   ej        Zej        ed<   d	Ze
ed<   d	Ze
ed<   d ZddZdS )FlaxTransformer2DModela  
    A Spatial Transformer layer with Gated Linear Unit (GLU) activation function as described in:
    https://arxiv.org/pdf/1506.02025.pdf


    Parameters:
        in_channels (:obj:`int`):
            Input number of channels
        n_heads (:obj:`int`):
            Number of heads
        d_head (:obj:`int`):
            Hidden states dimension inside each head
        depth (:obj:`int`, *optional*, defaults to 1):
            Number of transformers block
        dropout (:obj:`float`, *optional*, defaults to 0.0):
            Dropout rate
        use_linear_projection (`bool`, defaults to `False`): tbd
        only_cross_attention (`bool`, defaults to `False`): tbd
        dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
            Parameters `dtype`
        use_memory_efficient_attention (`bool`, *optional*, defaults to `False`):
            enable memory efficient attention https://arxiv.org/abs/2112.05682
        split_head_dim (`bool`, *optional*, defaults to `False`):
            Whether to split the head dimension into a new axis for the self-attention computation. In most cases,
            enabling this flag should speed up the computation for Stable Diffusion 2.x and Stable Diffusion XL.
    in_channelsr   r   rv   depthr[   r\   Fuse_linear_projectionr   r_   r]   r^   c                     t          j        dd           _         j         j        z   j        r!t          j         j                   _        n#t          j	        ddd j                   _         fdt           j                  D              _         j        r!t          j         j                   _        n#t          j	        ddd j                   _        t          j         j        	           _        d S )
N    r   )
num_groupsr   r   )rv   rv   VALID)kernel_sizestridespaddingr_   c                     g | ]<}t          j        j        j        j        j        j        j                   =S ))r\   r   r_   r]   r^   )r   r   r   r\   r   r_   r]   r^   ).0rH   rq   rp   s     r   
<listcomp>z0FlaxTransformer2DModel.setup.<locals>.<listcomp>~  sc     #
 #
 #
  &%)%>j/3/R#2	 	 	#
 #
 #
r    rg   )rj   	GroupNormnormr   r   r   rk   r_   proj_inConvranger   transformer_blocksproj_outrm   r\   rn   ro   s   `@r   rr   zFlaxTransformer2DModel.setupo  s!   LB===	L4;.	% 		8ITZ@@@DLL7"j  DL#
 #
 #
 #
 #
 4:&&#
 #
 #
 % 		HYdjAAADMMG"j  DM  ZT\:::r    Tc                 @   |j         \  }}}}|}|                     |          }| j        r0|                    |||z  |          }|                     |          }n/|                     |          }|                    |||z  |          }| j        D ]}	 |	|||          }| j        r.|                     |          }|                    ||||          }n-|                    ||||          }|                     |          }||z   }|                     ||          S r   )r*   r   r   rx   r   r   r   rn   )
rp   r   r   r   batchheightwidthchannelsr   transformer_blocks
             r   r   zFlaxTransformer2DModel.__call__  sB   )6)<&vuh 		-00% 	S)11%%RRM LL77MM LL77M)11%%RRM!%!8 	c 	c--mWTabbbMM% 	9 MM-88M)11%QQMM)11%QQM MM-88M%0!!-}!MMMr    Nr   )r   r   r   r   r   r   r   r\   r   r   r   r   r   r   r_   r]   r^   rr   r   r   r    r   r   r   H  s          6 LLLKKKE3NNNGU"'4'''!&$&&&{E39"""+0"D000 ND   (; (; (;TN N N N N Nr    r   c                   \    e Zd ZU dZeed<   dZeed<   ej	        Z
ej
        ed<   d Zd
dZd	S )r   a  
    Flax module that encapsulates two Linear layers separated by a non-linearity. It is the counterpart of PyTorch's
    [`FeedForward`] class, with the following simplifications:
    - The activation function is currently hardcoded to a gated linear unit from:
    https://arxiv.org/abs/2002.05202
    - `dim_out` is equal to `dim`.
    - The number of hidden dimensions is hardcoded to `dim * 4` in [`FlaxGELU`].

    Parameters:
        dim (:obj:`int`):
            Inner hidden states dimension
        dropout (:obj:`float`, *optional*, defaults to 0.0):
            Dropout rate
        dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
            Parameters `dtype`
    r~   r[   r\   r_   c                     t          | j        | j        | j                  | _        t          j        | j        | j                  | _        d S )Nr   )	FlaxGEGLUr~   r\   r_   net_0rj   rk   net_2r   s    r   rr   zFlaxFeedForward.setup  s;     txtzBB
Xdhdj999


r    Tc                 ^    |                      ||          }|                     |          }|S r   )r   r   )rp   r   r   s      r   r   zFlaxFeedForward.__call__  s.    

=
NN

=11r    Nr   r   r   r   r   r   r   r\   r   r   r   r_   rr   r   r   r    r   r   r     sr          " 
HHHGU{E39""": : :     r    r   c                   \    e Zd ZU dZeed<   dZeed<   ej	        Z
ej
        ed<   d Zd
dZd	S )r   a  
    Flax implementation of a Linear layer followed by the variant of the gated linear unit activation function from
    https://arxiv.org/abs/2002.05202.

    Parameters:
        dim (:obj:`int`):
            Input hidden states dimension
        dropout (:obj:`float`, *optional*, defaults to 0.0):
            Dropout rate
        dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
            Parameters `dtype`
    r~   r[   r\   r_   c                     | j         dz  }t          j        |dz  | j                  | _        t          j        | j                  | _        d S )Nr   ru   r   rg   )r~   rj   rk   r_   projrm   r\   rn   ro   s     r   rr   zFlaxGEGLU.setup  sF    HqL	HY]$*===	ZT\:::r    Tc                     |                      |          }t          j        |dd          \  }}|                     |t	          j        |          z  |          S )Nru   r   r   )r   r   splitrn   rj   gelu)rp   r   r   hidden_linearhidden_gelus        r   r   zFlaxGEGLU.__call__  sW    		-00%(Y}aa%H%H%H"{!!-"'+2F2F"FVc!dddr    Nr   r   r   r    r   r   r     sx           
HHHGU{E39"""; ; ;
e e e e e er    r   )r   )r6   rO   
flax.linenlinenrj   r   	jax.numpynumpyr   r   rD   r   	PrecisionHIGHESTrS   ModulerU   r   r   r   r   r   r    r   <module>r      s              



      0$ 0$ 0$ 0$ 0$ 0$h "%!2!:TXpt-) -)NQ-)jm-) -) -) -)`wN wN wN wN wNBI wN wN wNtQN QN QN QN QN	 QN QN QNhgN gN gN gN gNRY gN gN gNT    bi   De e e e e	 e e e e er    