
    wi
                        d dl Z d dlmZmZ d dlZd dlmZ ddlmZmZ ddl	m
Z
 ddlmZ ddlmZ dd	lmZmZmZ dd
lmZ ddlmZ ddlmZ  ej        e          Zd ZddZ G d dej                  Z G d dej                  Z  G d dej                  Z! G d dej                  Z" G d de
e          Z# G d de
e          Z$dS )     N)OptionalUnion)nn   )ConfigMixinregister_to_config)
ModelMixin)FeedForward)	Attention)TimestepEmbedding	Timestepsget_2d_sincos_pos_embed)Transformer2DModelOutput)AdaLayerNorm)loggingc                 &   d }||d|z  z
  k     s||d|z  z   k    rt                               d           t          j                    5   |||z
  |z            } |||z
  |z            }|                     d|z  dz
  d|z  dz
             |                                  |                     |t          j        d          z             | 	                    |           | 
                    ||           | cd d d            S # 1 swxY w Y   d S )Nc                 `    dt          j        | t          j        d          z            z   dz  S )N      ?       @)matherfsqrt)xs    }/root/.openclaw/workspace/chatterbox_venv_py311/lib/python3.11/site-packages/diffusers/pipelines/unidiffuser/modeling_uvit.pynorm_cdfz(_no_grad_trunc_normal_.<locals>.norm_cdf   s)    dhq49S>>1222c99       zjmean is more than 2 std from [a, b] in nn.init.trunc_normal_. The distribution of values may be incorrect.   r   )minmax)loggerwarningtorchno_graduniform_erfinv_mul_r   r   add_clamp_)tensormeanstdabr   lus           r   _no_grad_trunc_normal_r1      s}   : : : 	q1s7{q1s7{ 2 2;	
 	
 	

 
   Ha$h#%&&Ha$h#%&& 	A	1q519--- 	 	C$)C..()))D 	!###+                 s   	B0DD
D
        r          r   c                 (    t          | ||||          S )a  Fills the input Tensor with values drawn from a truncated
    normal distribution. The values are effectively drawn from the normal distribution :math:`\mathcal{N}(\text{mean},
    \text{std}^2)` with values outside :math:`[a, b]` redrawn until they are within the bounds. The method used for
    generating the random values works best when :math:`a \leq \text{mean} \leq b`.

    Args:
        tensor: an n-dimensional `torch.Tensor`
        mean: the mean of the normal distribution
        std: the standard deviation of the normal distribution
        a: the minimum cutoff value
        b: the maximum cutoff value
    Examples:
        >>> w = torch.empty(3, 5) >>> nn.init.trunc_normal_(w)
    )r1   )r*   r+   r,   r-   r.   s        r   trunc_normal_r5   9   s      "&$Q:::r   c                   <     e Zd ZdZ	 	 	 	 	 	 	 	 	 d
 fd	Zd	 Z xZS )
PatchEmbedz2D Image to Patch Embedding      r      FTc
                    t                                                       ||z  ||z  z  }
|| _        || _        t	          j        ||||f||          | _        |rt	          j        |dd          | _        nd | _        |	| _	        | j	        rqt          |t          |
dz                      }|                     dt          j        |                                                              d          d           d S d S )	N)kernel_sizestridebiasFgư>)elementwise_affineeps      ?	pos_embedr   )
persistent)super__init__flatten
layer_normr   Conv2dproj	LayerNormnormuse_pos_embedr   intregister_bufferr#   
from_numpyfloat	unsqueeze)selfheightwidth
patch_sizein_channels	embed_dimrG   rF   r>   rL   num_patchesrB   	__class__s               r   rE   zPatchEmbed.__init__O   s    	+0CD$IZ0HQ[bf
 
 
	  	Y5dSSSDIIDI* 	r/	3{C?O;P;PQQI  e.>y.I.I.O.O.Q.Q.[.[\].^.^kp qqqqq	r 	rr   c                     |                      |          }| j        r)|                    d                              dd          }| j        r|                     |          }| j        r
|| j        z   S |S )Nr   r   )rI   rF   	transposerG   rK   rL   rB   )rR   latents     r   forwardzPatchEmbed.forwardn   sv    6""< 	7^^A&&00A66F? 	'YYv&&F 	DN**Mr   )	r8   r8   r9   r   r:   FTTT)__name__
__module____qualname____doc__rE   r]   __classcell__rY   s   @r   r7   r7   L   su        %% r r r r r r>	 	 	 	 	 	 	r   r7   c                   *     e Zd Zdef fdZd Z xZS )	SkipBlockdimc                     t                                                       t          j        d|z  |          | _        t          j        |          | _        d S )Nr   )rD   rE   r   Linearskip_linearrJ   rK   )rR   rf   rY   s     r   rE   zSkipBlock.__init__{   sH    9QWc22 L%%			r   c                     |                      t          j        ||gd                    }|                     |          }|S )Nrf   )ri   r#   catrK   )rR   r   skips      r   r]   zSkipBlock.forward   s<    UY4yb999::IIaLLr   )r^   r_   r`   rM   rE   r]   rb   rc   s   @r   re   re   z   sS        &C & & & & & &      r   re   c                        e Zd ZdZ	 	 	 	 	 	 	 	 	 	 	 	 dded	ed
edee         dedee         dedededededededef fdZ	 	 	 	 	 	 ddZ	 xZ
S )UTransformerBlockaS  
    A modification of BasicTransformerBlock which supports pre-LayerNorm and post-LayerNorm configurations.

    Parameters:
        dim (`int`): The number of channels in the input and output.
        num_attention_heads (`int`): The number of heads to use for multi-head attention.
        attention_head_dim (`int`): The number of channels in each head.
        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
        cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
        activation_fn (`str`, *optional*, defaults to `"geglu"`):
            Activation function to be used in feed-forward.
        num_embeds_ada_norm (:obj: `int`, *optional*):
            The number of diffusion steps used during training. See `Transformer2DModel`.
        attention_bias (:obj: `bool`, *optional*, defaults to `False`):
            Configure if the attentions should contain a bias parameter.
        only_cross_attention (`bool`, *optional*):
            Whether to use only cross-attention layers. In this case two cross attention layers are used.
        double_self_attention (`bool`, *optional*):
            Whether to use two self-attention layers. In this case no cross attention layers are used.
        upcast_attention (`bool`, *optional*):
            Whether to upcast the query and key to float32 when performing the attention calculation.
        norm_elementwise_affine (`bool`, *optional*):
            Whether to use learnable per-element affine parameters during layer normalization.
        norm_type (`str`, defaults to `"layer_norm"`):
            The layer norm implementation to use.
        pre_layer_norm (`bool`, *optional*):
            Whether to perform layer normalization before the attention and feedforward operations ("pre-LayerNorm"),
            as opposed to after ("post-LayerNorm"). Note that `BasicTransformerBlock` uses pre-LayerNorm, e.g.
            `pre_layer_norm = True`.
        final_dropout (`bool`, *optional*):
            Whether to use a final Dropout layer after the feedforward network.
    r2   NgegluFTrG   rf   num_attention_headsattention_head_dimcross_attention_dimactivation_fnnum_embeds_ada_normattention_biasonly_cross_attentiondouble_self_attentionupcast_attentionnorm_elementwise_affine	norm_typepre_layer_normfinal_dropoutc           	         t                                                       |	| _        |d uo|dk    | _        || _        |dv r|t          d| d| d          t          ||||||	r|nd |          | _        ||
r t          ||
s|nd |||||          | _        nd | _        | j        rt          ||          | _
        nt          j        ||          | _
        ||
r3| j        rt          ||          nt          j        ||          | _        nd | _        t          j        ||          | _        t          ||||	          | _        d S 
Nada_norm)r   ada_norm_zeroz`norm_type` is set to zw, but `num_embeds_ada_norm` is not defined. Please make sure to define `num_embeds_ada_norm` if setting `norm_type` to .)	query_dimheadsdim_headdropoutr>   rt   rz   )r   rt   r   r   r   r>   rz   )r?   )r   ru   r~   rD   rE   rx   use_ada_layer_normr}   
ValueErrorr   attn1attn2r   norm1r   rJ   norm2norm3r
   ffrR   rf   rr   rs   r   rt   ru   rv   rw   rx   ry   rz   r{   r|   r}   r~   rY   s                   r   rE   zUTransformerBlock.__init__      $ 	$8!#6d#B"_	U_H_,555:M:UX X XKTX X X   %'7K U 3 3QU-
 
 

 *.C*"?T$^$7$7Z^)+#!1  DJJ DJ" 	W%c+>??DJJc>UVVVDJ*.C* *SS"5666\#:QRRR JJ DJ \#:QRRR
c7-_lmmmr   c                    | j         r4| j        r|                     ||          }n|                     |          }n|}||ni } | j        |f| j        r|nd |d|}	| j         s3| j        r|                     |	|          }	n|                     |	          }	|	|z   }| j        | j         r3| j        r|                     ||          n|                     |          }n|} | j        |f||d|}	| j         s2| j        r|                     |	|          n|                     |	          }	|	|z   }| j         r|                     |          }n|}|                     |          }
| j         s|                     |
          }
|
|z   }|S N)encoder_hidden_statesattention_mask	r}   r   r   r   rx   r   r   r   r   )rR   hidden_statesr   r   encoder_attention_masktimestepcross_attention_kwargsclass_labelsnorm_hidden_statesattn_output	ff_outputs              r   r]   zUTransformerBlock.forward   s     	/& ?%)ZZx%H%H""%)ZZ%>%>""!. <R;]!7!7ce dj
;?;T"^"7"7Z^)
 
 %	
 
 " 	6& 6"jjh??"jj55#m3:!" 3;?;RqDJJ}h777X\XbXbcpXqXq #" &3"
 %$*"&;5  )	 K & xCGCZwdjjh???`d`j`jkv`w`w'-7M  	/!%M!:!:!.GG.//	 " 	.

9--I!M1r   )r2   Nrq   NFFFFTrG   TFNNNNNNr^   r_   r`   ra   rM   r   strboolrE   r]   rb   rc   s   @r   rp   rp      s`        L -1$-1$%*&+!&(,%##!Kn KnKn !Kn  	Kn &c]Kn Kn &c]Kn Kn #Kn  $Kn Kn "&Kn Kn Kn  !Kn Kn Kn Kn Kn Kn` "##M M M M M M M Mr   rp   c                        e Zd ZdZ	 	 	 	 	 	 	 	 	 	 	 	 dded	ed
edee         dedee         dedededededededef fdZ	 	 	 	 	 	 ddZ	 xZ
S )UniDiffuserBlocka@	  
    A modification of BasicTransformerBlock which supports pre-LayerNorm and post-LayerNorm configurations and puts the
    LayerNorms on the residual backbone of the block. This matches the transformer block in the [original UniDiffuser
    implementation](https://github.com/thu-ml/unidiffuser/blob/main/libs/uvit_multi_post_ln_v1.py#L104).

    Parameters:
        dim (`int`): The number of channels in the input and output.
        num_attention_heads (`int`): The number of heads to use for multi-head attention.
        attention_head_dim (`int`): The number of channels in each head.
        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
        cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
        activation_fn (`str`, *optional*, defaults to `"geglu"`):
            Activation function to be used in feed-forward.
        num_embeds_ada_norm (:obj: `int`, *optional*):
            The number of diffusion steps used during training. See `Transformer2DModel`.
        attention_bias (:obj: `bool`, *optional*, defaults to `False`):
            Configure if the attentions should contain a bias parameter.
        only_cross_attention (`bool`, *optional*):
            Whether to use only cross-attention layers. In this case two cross attention layers are used.
        double_self_attention (`bool`, *optional*):
            Whether to use two self-attention layers. In this case no cross attention layers are used.
        upcast_attention (`bool`, *optional*):
            Whether to upcast the query and key to float() when performing the attention calculation.
        norm_elementwise_affine (`bool`, *optional*):
            Whether to use learnable per-element affine parameters during layer normalization.
        norm_type (`str`, defaults to `"layer_norm"`):
            The layer norm implementation to use.
        pre_layer_norm (`bool`, *optional*):
            Whether to perform layer normalization before the attention and feedforward operations ("pre-LayerNorm"),
            as opposed to after ("post-LayerNorm"). The original UniDiffuser implementation is post-LayerNorm
            (`pre_layer_norm = False`).
        final_dropout (`bool`, *optional*):
            Whether to use a final Dropout layer after the feedforward network.
    r2   Nrq   FTrG   rf   rr   rs   rt   ru   rv   rw   rx   ry   rz   r{   r|   r}   r~   c           	         t                                                       |	| _        |d uo|dk    | _        || _        |dv r|t          d| d| d          t          ||||||	r|nd |          | _        ||
r t          ||
s|nd |||||          | _        nd | _        | j        rt          ||          | _
        nt          j        ||          | _
        ||
r3| j        rt          ||          nt          j        ||          | _        nd | _        t          j        ||          | _        t          ||||	          | _        d S r   r   r   s                   r   rE   zUniDiffuserBlock.__init__r  r   r   c                    | j         r3| j        r|                     ||          }n|                     |          }||ni } | j        |f| j        r|nd |d|}||z   }| j         s3| j        r|                     ||          }n|                     |          }| j        | j         r2| j        r|                     ||          n|                     |          } | j        |f||d|}||z   }| j         s2| j        r|                     ||          n|                     |          }| j         r|                     |          }|                     |          }	|	|z   }| j         s|                     |          }|S r   r   )
rR   r   r   r   r   r   r   r   r   r   s
             r   r]   zUniDiffuserBlock.forward  s     	:& : $

=( C C $

= 9 9 <R;]!7!7ce dj
;?;T"^"7"7Z^)
 
 %	
 
 $m3
 " 	:& : $

=( C C $

= 9 9:!" ;?;RqDJJ}h777X\XbXbcpXqXq  %$*&;5  )	 K (-7M & ;?;RqDJJ}h777X\XbXbcpXqXq   	6 JJ}55MGGM**	!M1 " 	6 JJ}55Mr   )r2   Nrq   NFFFFTrG   FTr   r   rc   s   @r   r   r   N  s`       ! !P -1$-1$%*&+!&(,%$"!Kn KnKn !Kn  	Kn &c]Kn Kn &c]Kn Kn #Kn  $Kn Kn "&Kn Kn Kn  !Kn Kn Kn Kn Kn Kn` "##M M M M M M M Mr   r   c            .       4    e Zd ZdZe	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d)dededee         dee         dedededee         dedee         dee         dee         de	dee         dededede	d e	d!ed"ed#ef, fd$            Z
	 	 	 	 	 	 	 d*d%ed&ed'efd(Z xZS )+UTransformer2DModelay  
    Transformer model based on the [U-ViT](https://github.com/baofff/U-ViT) architecture for image-like data. Compared
    to [`Transformer2DModel`], this model has skip connections between transformer blocks in a "U"-shaped fashion,
    similar to a U-Net. Supports only continuous (actual embeddings) inputs, which are embedded via a [`PatchEmbed`]
    layer and then reshaped to (b, t, d).

    Parameters:
        num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention.
        attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
        in_channels (`int`, *optional*):
            Pass if the input is continuous. The number of channels in the input.
        out_channels (`int`, *optional*):
            The number of output channels; if `None`, defaults to `in_channels`.
        num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
        norm_num_groups (`int`, *optional*, defaults to `32`):
            The number of groups to use when performing Group Normalization.
        cross_attention_dim (`int`, *optional*): The number of encoder_hidden_states dimensions to use.
        attention_bias (`bool`, *optional*):
            Configure if the TransformerBlocks' attention should contain a bias parameter.
        sample_size (`int`, *optional*): Pass if the input is discrete. The width of the latent images.
            Note that this is fixed at training time as it is used for learning a number of position embeddings. See
            `ImagePositionalEmbeddings`.
        num_vector_embeds (`int`, *optional*):
            Pass if the input is discrete. The number of classes of the vector embeddings of the latent pixels.
            Includes the class for the masked latent pixel.
        patch_size (`int`, *optional*, defaults to 2):
            The patch size to use in the patch embedding.
        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
        num_embeds_ada_norm ( `int`, *optional*): Pass if at least one of the norm_layers is `AdaLayerNorm`.
            The number of diffusion steps used during training. Note that this is fixed at training time as it is used
            to learn a number of embeddings that are added to the hidden states. During inference, you can denoise for
            up to but not more than steps than `num_embeds_ada_norm`.
        use_linear_projection (int, *optional*): TODO: Not used
        only_cross_attention (`bool`, *optional*):
            Whether to use only cross-attention layers. In this case two cross attention layers are used in each
            transformer block.
        upcast_attention (`bool`, *optional*):
            Whether to upcast the query and key to float() when performing the attention calculation.
        norm_type (`str`, *optional*, defaults to `"layer_norm"`):
            The Layer Normalization implementation to use. Defaults to `torch.nn.LayerNorm`.
        block_type (`str`, *optional*, defaults to `"unidiffuser"`):
            The transformer block implementation to use. If `"unidiffuser"`, has the LayerNorms on the residual
            backbone of each transformer block; otherwise has them in the attention/feedforward branches (the standard
            behavior in `diffusers`.)
        pre_layer_norm (`bool`, *optional*):
            Whether to perform layer normalization before the attention and feedforward operations ("pre-LayerNorm"),
            as opposed to after ("post-LayerNorm"). The original UniDiffuser implementation is post-LayerNorm
            (`pre_layer_norm = False`).
        norm_elementwise_affine (`bool`, *optional*):
            Whether to use learnable per-element affine parameters during layer normalization.
        use_patch_pos_embed (`bool`, *optional*):
            Whether to use position embeddings inside the patch embedding layer (`PatchEmbed`).
        final_dropout (`bool`, *optional*):
            Whether to use a final Dropout layer after the feedforward network.
    r9   X   Nr   r2       Fr   rq   rG   unidiffuserTrr   rs   rV   out_channels
num_layersr   norm_num_groupsrt   rw   sample_sizenum_vector_embedsrU   ru   rv   use_linear_projectionrx   rz   r|   
block_typer}   r{   ff_final_dropoutc                   	 t                                                       || _        | _        | _        z  ||
J d            |

J d            |
| _        |
| _        || _        t          |
|
|||          | _	        |dk    rt          nt          t          j        	fdt          |dz            D                       | _         	          | _        t          j        	fdt          |dz            D                       | _        ||n|| _        t          j                  | _        d S )	Nz0Patch input requires in_channels and patch_size.z?UTransformer2DModel over patched input must provide sample_sizerS   rT   rU   rV   rW   rL   r   c                 B    g | ]} 	
           S )r   rt   ru   rv   rw   rx   rz   r|   r}   r{   r~    .0dru   rw   rs   	block_clsrt   r   r   	inner_dimr{   r|   rr   rv   rx   r}   rz   s     r   
<listcomp>z0UTransformer2DModel.__init__.<locals>.<listcomp>  sf       " ! 	'&#(;"/(;#1)=%5'#1,C"2    r   r   r   c                     g | ]=}t          j        t          	           	
           d          >S )r   )rn   block)r   
ModuleDictre   r   s     r   r   z0UTransformer2DModel.__init__.<locals>.<listcomp>  s       0 /  )%! ! "+%/.$+0C*70C+91E-=&/+94K*:" " "	    r   )rD   rE   r   rr   rs   rS   rT   rU   r7   rB   r   rp   r   
ModuleListrangetransformer_in_blockstransformer_mid_blocktransformer_out_blocksr   rJ   norm_out)rR   rr   rs   rV   r   r   r   r   rt   rw   r   r   rU   ru   rv   r   rx   rz   r|   r   r}   r{   use_patch_pos_embedr   r   r   rY   s    ``   ` ``   `` ``` `` `@@r   rE   zUTransformer2DModel.__init__M  sc   6 	%:"#6 "4'*<<	 &:+A+ACu+A+AA&&(i&&& " 
$#!#-
 
 
 &&(II)I%']                 " zQ//#  &
 &
", &/Y 3' 3)!5-)$;*&
 &
 &
"& ')m                 0 zQ//1  '
 '
#< ,8+?KK\ Y//r   return_dicthidden_states_is_embedding
unpatchifyc	           	         |s|rt          d| d| d| d          |s|                     |          }g }	| j        D ]'}
 |
|||||          }|	                    |           (|                     |          }| j        D ]<} |d         ||	                                          } |d         |||||          }=|                     |          }|rt          |j	        d         d	z            x}}|
                    d
||| j        | j        | j        f          }t          j        d|          }|
                    d
| j        || j        z  || j        z  f          }n|}|s|fS t          |          S )a  
        Args:
            hidden_states ( When discrete, `torch.LongTensor` of shape `(batch size, num latent pixels)`.
                When continuous, `torch.Tensor` of shape `(batch size, channel, height, width)`): Input hidden_states
            encoder_hidden_states ( `torch.LongTensor` of shape `(batch size, encoder_hidden_states dim)`, *optional*):
                Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
                self-attention.
            timestep ( `torch.long`, *optional*):
                Optional timestep to be applied as an embedding in AdaLayerNorm's. Used to indicate denoising step.
            class_labels ( `torch.LongTensor` of shape `(batch size, num classes)`, *optional*):
                Optional class labels to be applied as an embedding in AdaLayerZeroNorm. Used to indicate class labels
                conditioning.
            cross_attention_kwargs (*optional*):
                Keyword arguments to supply to the cross attention layers, if used.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`models.unets.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
                tuple.
            hidden_states_is_embedding (`bool`, *optional*, defaults to `False`):
                Whether or not hidden_states is an embedding directly usable by the transformer. In this case we will
                ignore input handling (e.g. continuous, vectorized, etc.) and directly feed hidden_states into the
                transformer blocks.
            unpatchify (`bool`, *optional*, defaults to `True`):
                Whether to unpatchify the transformer output.

        Returns:
            [`~models.transformer_2d.Transformer2DModelOutput`] or `tuple`:
            [`~models.transformer_2d.Transformer2DModelOutput`] if `return_dict` is True, otherwise a `tuple`. When
            returning a tuple, the first element is the sample tensor.
        z!Cannot both define `unpatchify`: z and `return_dict`: z since when `unpatchify` is zy the returned output is of shape (batch_size, seq_len, hidden_dim) rather than (batch_size, num_channels, height, width).)r   r   r   r   rn   r   r   rA   rk   shapenhwpqc->nchpwq)sample)r   rB   r   appendr   r   popr   rM   r   reshaperU   r   r#   einsumr   )rR   r   r   r   r   r   r   r   r   skipsin_block	out_blockrS   rT   outputs                  r   r]   zUTransformer2DModel.forward  s%   T  	k 	JJ J JT_ J J$.J J J   * 	: NN=99M
 2 	( 	(H$H&;!'=)  M LL'''' 22=AA 4 	 	I-If-mUYY[[IIM.Ig.&;!'=)  MM m44  	# !4Q!73!>???FU)1165$/4?DL]^ 2  M "L)9=IIM"**4,ft.FPTP_H_` +  FF #F 	9'v6666r   )r9   r   NNr   r2   r   NFNNr   rq   NFFFrG   r   FTFF)NNNNTFT)r^   r_   r`   ra   r   rM   r   rP   r   r   rE   r]   rb   rc   s   @r   r   r     s       7 7r  $&"$%)&*!-1$%)+/$%$-1&+%*!&%'$(,!!&1H0 H0 H0  H0 c]	H0
 smH0 H0 H0 H0 &c]H0 H0 c]H0 $C=H0 SMH0 H0 &c]H0   $!H0" ##H0$ %H0& 'H0( )H0* +H0, "&-H00 1H0 H0 H0 H0 H0 H0Z ## +0f7 f7 f7 %)f7 f7 f7 f7 f7 f7 f7 f7 f7r   r   c            6           e Zd ZdZe	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d3dedededededee         dee         dedededee         dedee         dee         dee         de	d ee         d!ed"ed#ed$e	d%e	d&ed'ed(ed)ef4 fd*            Z
ej        j        d+             Z	 	 	 d4d,ej        d-ej        d.ej        d/eej        eef         d0eej        eef         d1eeej        eef                  fd2Z xZS )5UniDiffuserModela  
    Transformer model for a image-text [UniDiffuser](https://arxiv.org/pdf/2303.06555.pdf) model. This is a
    modification of [`UTransformer2DModel`] with input and output heads for the VAE-embedded latent image, the
    CLIP-embedded image, and the CLIP-embedded prompt (see paper for more details).

    Parameters:
        text_dim (`int`): The hidden dimension of the CLIP text model used to embed images.
        clip_img_dim (`int`): The hidden dimension of the CLIP vision model used to embed prompts.
        num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention.
        attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
        in_channels (`int`, *optional*):
            Pass if the input is continuous. The number of channels in the input.
        out_channels (`int`, *optional*):
            The number of output channels; if `None`, defaults to `in_channels`.
        num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
        norm_num_groups (`int`, *optional*, defaults to `32`):
            The number of groups to use when performing Group Normalization.
        cross_attention_dim (`int`, *optional*): The number of encoder_hidden_states dimensions to use.
        attention_bias (`bool`, *optional*):
            Configure if the TransformerBlocks' attention should contain a bias parameter.
        sample_size (`int`, *optional*): Pass if the input is discrete. The width of the latent images.
            Note that this is fixed at training time as it is used for learning a number of position embeddings. See
            `ImagePositionalEmbeddings`.
        num_vector_embeds (`int`, *optional*):
            Pass if the input is discrete. The number of classes of the vector embeddings of the latent pixels.
            Includes the class for the masked latent pixel.
        patch_size (`int`, *optional*, defaults to 2):
            The patch size to use in the patch embedding.
        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
        num_embeds_ada_norm ( `int`, *optional*): Pass if at least one of the norm_layers is `AdaLayerNorm`.
            The number of diffusion steps used during training. Note that this is fixed at training time as it is used
            to learn a number of embeddings that are added to the hidden states. During inference, you can denoise for
            up to but not more than steps than `num_embeds_ada_norm`.
        use_linear_projection (int, *optional*): TODO: Not used
        only_cross_attention (`bool`, *optional*):
            Whether to use only cross-attention layers. In this case two cross attention layers are used in each
            transformer block.
        upcast_attention (`bool`, *optional*):
            Whether to upcast the query and key to float32 when performing the attention calculation.
        norm_type (`str`, *optional*, defaults to `"layer_norm"`):
            The Layer Normalization implementation to use. Defaults to `torch.nn.LayerNorm`.
        block_type (`str`, *optional*, defaults to `"unidiffuser"`):
            The transformer block implementation to use. If `"unidiffuser"`, has the LayerNorms on the residual
            backbone of each transformer block; otherwise has them in the attention/feedforward branches (the standard
            behavior in `diffusers`.)
        pre_layer_norm (`bool`, *optional*):
            Whether to perform layer normalization before the attention and feedforward operations ("pre-LayerNorm"),
            as opposed to after ("post-LayerNorm"). The original UniDiffuser implementation is post-LayerNorm
            (`pre_layer_norm = False`).
        norm_elementwise_affine (`bool`, *optional*):
            Whether to use learnable per-element affine parameters during layer normalization.
        use_patch_pos_embed (`bool`, *optional*):
            Whether to use position embeddings inside the patch embedding layer (`PatchEmbed`).
        ff_final_dropout (`bool`, *optional*):
            Whether to use a final Dropout layer after the feedforward network.
        use_data_type_embedding (`bool`, *optional*):
            Whether to use a data type embedding. This is only relevant for UniDiffuser-v1 style models; UniDiffuser-v1
            is continue-trained from UniDiffuser-v0 on non-publically-available data and accepts a `data_type`
            argument, which can either be `1` to use the weights trained on non-publically-available data or `0`
            otherwise. This argument is subsequently embedded by the data type embedding, if used.
    r:      M   r9   r   Nr   r2   r   Frq   rG   r   Ttext_dimclip_img_dimnum_text_tokensrr   rs   rV   r   r   r   r   rt   rw   r   r   rU   ru   rv   r   rx   rz   r|   r   r}   r{   r   use_data_type_embeddingc                 @   t                                                       ||z  | _        |
J d            || _        || _        ||n|| _        || _        | j        |z  | j        |z  z  | _        t          ||||| j        |          | _	        t          j        || j                  | _        t          j        || j                  | _        t          | j        dd          | _        |r$t!          | j        d| j        z  | j                  nt          j                    | _        t          | j        dd          | _        |r$t!          | j        d| j        z  | j                  nt          j                    | _        || _        d|z   d	z   | j        z   | _        t          j        t1          j        d	| j        | j                            | _        t          j        |	
          | _        t;          | j        d           || _        | j        rQt          j        d| j                  | _         t          j        t1          j        d	d	| j                            | _!        tE          d$i d|d|d|d|d|d|	d|
d|d|d|d|d|d|d|d|d|d|d|d|d |d!|d"|d#|| _#        |dz  |z  }t          j        | j        |          | _$        t          j        | j        |          | _%        t          j        | j        |          | _&        d S )%Nz<UniDiffuserModel over patched input must provide sample_sizer   Tr   )flip_sin_to_cosdownscale_freq_shift   )out_dimr   r   )pg{Gz?)r,   rr   rs   rV   r   r   r   r   rt   rw   r   r   rU   ru   rv   r   rx   rz   r|   r   r}   r{   r   r   r   )'rD   rE   r   r   rV   r   rU   rX   r7   
vae_img_inr   rh   clip_img_intext_inr   timestep_img_projr   Identitytimestep_img_embedtimestep_text_projtimestep_text_embedr   
num_tokens	Parameterr#   zerosrB   Dropoutpos_embed_dropr5   r   	Embeddingdata_type_token_embeddingdata_type_pos_embed_tokenr   transformervae_img_outclip_img_outtext_out)rR   r   r   r   rr   rs   rV   r   r   r   r   rt   rw   r   r   rU   ru   rv   r   rx   rz   r|   r   r}   use_timestep_embeddingr{   r   r   r   	patch_dimrY   s                                 r   rE   zUniDiffuserModel.__init__  s   @ 	 -/AA&&(f&&&&&+7+?KK\$ ,
:t?OS]?]^
 %!#n-
 
 
 9\4>BBy4>:: "+N !""
 "
 "
 &DN"     	 #,N !"#
 #
 #
 &DN"     	   //1A58HHek!T_dn&U&UVV j7333dn$//// (?$' 	]-/\!T^-L-LD*-/\%+aDN:[:[-\-\D* / 
 
 
 3 3
11
 $
 &	

 "z
 G
 ,O
 !4 3
 *>
 $
 0/
 "z
 (-
 !4 3
 #8"7
  "6!5!
" .-#
$  i%
& "z'
( *>)
* %<$;+
, !4 3-
. .-/
6  ]l2	9T^Y??IdnlCC	$.(;;r   c                     dhS )NrB   r   )rR   s    r   no_weight_decayz UniDiffuserModel.no_weight_decay
  s
    }r   latent_image_embedsimage_embedsprompt_embedstimestep_imgtimestep_text	data_typec	           
         |j         d         }	|                     |          }
|                     |          }|                     |          }|                    d          |
                    d          }}t          j        |          s't          j        |gt
          j        |
j	                  }|t          j
        |	|j        |j	                  z  }|                     |          }|                    | j                  }|                     |          }|                    d          }t          j        |          s't          j        |gt
          j        |
j	                  }|t          j
        |	|j        |j	                  z  }|                     |          }|                    | j                  }|                     |          }|                    d          }| j        r|
J d            t          j        |          s't          j        |gt
          j        |
j	                  }|t          j
        |	|j        |j	                  z  }|                     |                              d          }t          j        ||||||
gd          }nt          j        |||||
gd          }| j        rGt          j        | j        ddddddf         | j        | j        ddddddf         gd          }n| j        }||z   }|                     |          }|                     ||dd|d	d
d	          d         }| j        r%|                    ddd|d|fd          \  }}}}}}n"|                    dd|d|fd          \  }}}}}|                     |          }t'          |j         d         dz            x}}|                    d||| j        | j        | j        f          }t          j        d|          }|                    d| j        || j        z  || j        z  f          }|                      |          }| !                    |          }|||fS )am  
        Args:
            latent_image_embeds (`torch.Tensor` of shape `(batch size, latent channels, height, width)`):
                Latent image representation from the VAE encoder.
            image_embeds (`torch.Tensor` of shape `(batch size, 1, clip_img_dim)`):
                CLIP-embedded image representation (unsqueezed in the first dimension).
            prompt_embeds (`torch.Tensor` of shape `(batch size, seq_len, text_dim)`):
                CLIP-embedded text representation.
            timestep_img (`torch.long` or `float` or `int`):
                Current denoising step for the image.
            timestep_text (`torch.long` or `float` or `int`):
                Current denoising step for the text.
            data_type: (`torch.int` or `float` or `int`, *optional*, defaults to `1`):
                Only used in UniDiffuser-v1-style models. Can be either `1`, to use weights trained on nonpublic data,
                or `0` otherwise.
            encoder_hidden_states ( `torch.LongTensor` of shape `(batch size, encoder_hidden_states dim)`, *optional*):
                Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
                self-attention.
            cross_attention_kwargs (*optional*):
                Keyword arguments to supply to the cross attention layers, if used.


        Returns:
            `tuple`: Returns relevant parts of the model's noise prediction: the first element of the tuple is tbe VAE
            image embedding, the second element is the CLIP image embedding, and the third element is the CLIP text
            embedding.
        r   r   )dtypedevice)r  rl   NzBdata_type must be supplied if the model uses a data type embeddingr   FT)r   r   r   r   r   r   r   rA   rk   r   r   )"r   r   r   r   sizer#   	is_tensorr*   longr  onesr  r   tor   rQ   r   r   r   rM   r   rm   rB   r   r   r   splitr   r   rU   r   r   r   r   )rR   r   r  r  r  r  r  r   r   
batch_sizevae_hidden_statesclip_hidden_statestext_hidden_statesr   num_img_tokenstimestep_img_tokentimestep_text_tokendata_type_tokenr   rB   t_img_token_outt_text_token_outdata_type_token_outr   img_clip_outimg_vae_outrS   rT   s                               r   r]   zUniDiffuserModel.forward  s   L ).q1
 !OO,?@@!--l;;!\\-88*<*A*A!*D*DFWF\F\]^F_F_ |,, 	k <ejQbQijjjL $ej<CU^j^q&r&r&rr!33LAA 0222DD!445GHH/99a9@@ }-- 	m!L-
SdSklllM &
:]EXanau(v(v(vv"55mDD 2444:4FF"667JKK1;;;BB ' 	((*n(((?9-- h!L)EIN_Nfggg	 "EJzYbYi$j$j$jjI"<<YGGQQVWQXXO!I&'#&&% 
 
 
MM "I#%8:LN`bst  M ' 	'	7U7AAA.0NPTP^_`_`_`bgbibiklklkl_lPmntu  II I%	1++M:: (("7#9'+ ) 	
 	
 	 ' 	 ##Q1oq.$QWX#YY # VcUhUhA>: Vi V VRO-x{ &&{33 [.q1S8999!))vudotHYZ * 
 
 l#3[AA!))t(&4?*BEDOD[\ * 
 
 ((66==**L(22r   )r:   r   r   r9   r   NNr   r2   r   NFNNNrq   NFFFrG   r   FFTFTF)r   NN)r^   r_   r`   ra   r   rM   r   rP   r   r   rE   r#   jitignorer   Tensorr   r]   rb   rc   s   @r   r   r   A  s       = =~  !#%"$%)&*!-1$%)+/$($-1&+%*!&%'$$(,!!%(-;F< F<F< F< 	F<
 !F<  F< c]F< smF< F< F< F< &c]F< F< c]F< $C=F<  SM!F<" #F<$ &c]%F<&  $'F<( #)F<* +F<, -F<. /F<0 1F<4 "&5F<8 9F<: "&;F< F< F< F< F< F<P Y   @A"#_3 _3"\_3 l_3 |	_3
 EL%45_3 U\5#56_3 E%,s":;<_3 _3 _3 _3 _3 _3 _3 _3r   r   )r2   r   r3   r   )%r   typingr   r   r#   r   configuration_utilsr   r   modelsr	   models.attentionr
   models.attention_processorr   models.embeddingsr   r   r   models.modeling_outputsr   models.normalizationr   utilsr   
get_loggerr^   r!   r1   r5   Moduler7   re   rp   r   r   r   r   r   r   <module>r*     sS    " " " " " " " "        B B B B B B B B             + + + + + + 3 3 3 3 3 3 V V V V V V V V V V ? ? ? ? ? ? 0 0 0 0 0 0       
	H	%	%" " "J; ; ; ;&+ + + + + + + +\    	   &| | | | |	 | | |B~ ~ ~ ~ ~ry ~ ~ ~Jk7 k7 k7 k7 k7*k k7 k7 k7\	l3 l3 l3 l3 l3z; l3 l3 l3 l3 l3r   