
    wi@                         d dl mZ d dlmZmZmZ d dlZd dlmZ ddl	m
Z
mZ ddlmZ ddlmZmZmZ dd	lmZ d
dlmZmZmZ e G d de                      Z G d dee
          ZdS )    )	dataclass)OptionalTupleUnionN   )ConfigMixinregister_to_config)
BaseOutput   )GaussianFourierProjectionTimestepEmbedding	Timesteps)
ModelMixin   )UNetMidBlock2Dget_down_blockget_up_blockc                   (    e Zd ZU dZej        ed<   dS )UNet2DOutputz
    The output of [`UNet2DModel`].

    Args:
        sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)`):
            The hidden states output from the last layer of the model.
    sampleN)__name__
__module____qualname____doc__torchTensor__annotations__     n/root/.openclaw/workspace/chatterbox_venv_py311/lib/python3.11/site-packages/diffusers/models/unets/unet_2d.pyr   r      s,           Lr   r   c            7           e Zd ZdZe	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d6deeeeeef         f                  dedede	de
dede	dee
df         dee
df         deedf         ded ed!ed"e
d#e
d$ed%e
d&ee         d'ed(ee         d)ed*e
d+e	d,ee
         d-ee         d.ee         f4 fd/            Z	 	 d7d0ej        d1eej        eef         d2eej                 d3e	d4eeef         f
d5Z xZS )8UNet2DModela  
    A 2D UNet model that takes a noisy sample and a timestep and returns a sample shaped output.

    This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
    for all models (such as downloading or saving).

    Parameters:
        sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`):
            Height and width of input/output sample. Dimensions must be a multiple of `2 ** (len(block_out_channels) -
            1)`.
        in_channels (`int`, *optional*, defaults to 3): Number of channels in the input sample.
        out_channels (`int`, *optional*, defaults to 3): Number of channels in the output.
        center_input_sample (`bool`, *optional*, defaults to `False`): Whether to center the input sample.
        time_embedding_type (`str`, *optional*, defaults to `"positional"`): Type of time embedding to use.
        freq_shift (`int`, *optional*, defaults to 0): Frequency shift for Fourier time embedding.
        flip_sin_to_cos (`bool`, *optional*, defaults to `True`):
            Whether to flip sin to cos for Fourier time embedding.
        down_block_types (`Tuple[str]`, *optional*, defaults to `("DownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D")`):
            Tuple of downsample block types.
        mid_block_type (`str`, *optional*, defaults to `"UNetMidBlock2D"`):
            Block type for middle of UNet, it can be either `UNetMidBlock2D` or `UnCLIPUNetMidBlock2D`.
        up_block_types (`Tuple[str]`, *optional*, defaults to `("AttnUpBlock2D", "AttnUpBlock2D", "AttnUpBlock2D", "UpBlock2D")`):
            Tuple of upsample block types.
        block_out_channels (`Tuple[int]`, *optional*, defaults to `(224, 448, 672, 896)`):
            Tuple of block output channels.
        layers_per_block (`int`, *optional*, defaults to `2`): The number of layers per block.
        mid_block_scale_factor (`float`, *optional*, defaults to `1`): The scale factor for the mid block.
        downsample_padding (`int`, *optional*, defaults to `1`): The padding for the downsample convolution.
        downsample_type (`str`, *optional*, defaults to `conv`):
            The downsample type for downsampling layers. Choose between "conv" and "resnet"
        upsample_type (`str`, *optional*, defaults to `conv`):
            The upsample type for upsampling layers. Choose between "conv" and "resnet"
        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
        act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
        attention_head_dim (`int`, *optional*, defaults to `8`): The attention head dimension.
        norm_num_groups (`int`, *optional*, defaults to `32`): The number of groups for normalization.
        attn_norm_num_groups (`int`, *optional*, defaults to `None`):
            If set to an integer, a group norm layer will be created in the mid block's [`Attention`] layer with the
            given number of groups. If left as `None`, the group norm layer will only be created if
            `resnet_time_scale_shift` is set to `default`, and if created will have `norm_num_groups` groups.
        norm_eps (`float`, *optional*, defaults to `1e-5`): The epsilon for normalization.
        resnet_time_scale_shift (`str`, *optional*, defaults to `"default"`): Time scale shift config
            for ResNet blocks (see [`~models.resnet.ResnetBlock2D`]). Choose from `default` or `scale_shift`.
        class_embed_type (`str`, *optional*, defaults to `None`):
            The type of class embedding to use which is ultimately summed with the time embeddings. Choose from `None`,
            `"timestep"`, or `"identity"`.
        num_class_embeds (`int`, *optional*, defaults to `None`):
            Input dimension of the learnable embedding matrix to be projected to `time_embed_dim` when performing class
            conditioning with `class_embed_type` equal to `None`.
    Nr   F
positionalr   TDownBlock2DAttnDownBlock2Dr&   r&   AttnUpBlock2Dr(   r(   	UpBlock2D   i  i  i  r   r   conv        silu       h㈵>defaultsample_sizein_channelsout_channelscenter_input_sampletime_embedding_type
freq_shiftflip_sin_to_cosdown_block_types.up_block_typesblock_out_channelslayers_per_blockmid_block_scale_factordownsample_paddingdownsample_typeupsample_typedropoutact_fnattention_head_dimnorm_num_groupsattn_norm_num_groupsnorm_epsresnet_time_scale_shiftadd_attentionclass_embed_typenum_class_embedsnum_train_timestepsc                    t                                                       || _        |
d         dz  }t          |          t          |	          k    rt	          d| d|	 d          t          |
          t          |          k    rt	          d|
 d| d          t          j        ||
d         dd	
          | _        |dk    r(t          |
d         d          | _	        d|
d         z  }nY|dk    r%t          |
d         ||          | _	        |
d         }n.|dk    r(t          j        ||
d                   | _	        |
d         }t          ||          | _        ||t          j        ||          | _        nD|dk    rt          ||          | _        n(|dk    rt          j        ||          | _        nd | _        t          j        g           | _        d | _        t          j        g           | _        |
d         }t)          |          D ]a\  }}|} |
|         }|t          |
          dz
  k    }!t+          ||| |||! |||||n|||||          }"| j                            |"           bt/          |
d         ||||||||n|
d         |||          | _        t1          t3          |
                    }#|#d         }t)          |	          D ]\  }}$|}%|#|         }|#t5          |dz   t          |
          dz
                     } |t          |
          dz
  k    }!t7          |$|dz   | ||%||! |||||n||||          }&| j                            |&           |}%||nt5          |
d         dz  d          }'t          j        |
d         |'|          | _        t          j                    | _        t          j        |
d         |dd
          | _         d S )Nr      z\Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: z. `up_block_types`: .zbMust provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: z. `down_block_types`: r   )r   r   )kernel_sizepaddingfourier   )embedding_sizescaler   r#   learnedtimestepidentityr   )
num_layersr4   r5   temb_channelsadd_downsample
resnet_epsresnet_act_fnresnet_groupsrD   r?   rH   r@   rB   )r4   rZ   rB   r\   r]   output_scale_factorrH   rD   r^   attn_groupsrI   )rY   r4   r5   prev_output_channelrZ   add_upsampler\   r]   r^   rD   rH   rA   rB   r0   )num_channels
num_groupseps)!super__init__r3   len
ValueErrornnConv2dconv_inr   	time_projr   	Embeddingr   time_embeddingclass_embeddingIdentity
ModuleListdown_blocks	mid_block	up_blocks	enumerater   appendr   listreversedminr   	GroupNormconv_norm_outSiLUconv_actconv_out))selfr3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   time_embed_dimtimestep_input_dimoutput_channelidown_block_typeinput_channelis_final_block
down_blockreversed_block_out_channelsup_block_typerb   up_blocknum_groups_out	__class__s)                                           r    rh   zUNet2DModel.__init__\   s   < 	&+A.2   C$7$777 go  g  g  Vd  g  g  g   !""c*:&;&;;; s  vH  s  s  `p  s  s  s  
 y.@.CQR\bccc )++6FXYZF[cefffDN!"%7%:!: L00&'9!'<ozZZDN!3A!6 I--\*=?QRS?TUUDN!3A!6/0BNSS #(8(D#%<0@.#Q#QD  ++#45G#X#XD  ++#%;~~#N#ND  #'D =,,r** ,A."+,<"="= 	0 	0A*M/2N#&8"9"9A"==N'+)+,#11#$-9K9W#5#5]k#5(? /  J  ##J//// (*2.(  6$;5G5S11YklnYo),'
 
 
 '+84F+G+G&H&H#4Q7 ). 9 9 	1 	1A}"08;N7AE3GYCZCZ]^C^8_8_`M#&8"9"9A"==N#+a/)+$7,!//#$-9K9W#5#5]k(?+  H  N!!(+++"0 -<,GSQcdeQfjkQkmoMpMp\7I!7LYgmuvvv			"4Q"7ST^_```r   r   rW   class_labelsreturn_dictreturnc           	      j   | j         j        rd|z  dz
  }|}t          j        |          s(t          j        |gt          j        |j                  }nLt          j        |          r8t          |j                  dk    r |d         	                    |j                  }|t          j
        |j        d         |j        |j                  z  }|                     |          }|	                    | j                  }|                     |          }| j        j|t          d          | j         j        dk    r|                     |          }|                     |          	                    | j                  }||z   }n| j        |t          d	          |}	|                     |          }|f}
| j        D ]:}t'          |d
          r ||||	          \  }}}	n |||          \  }}|
|z  }
;|                     ||          }d}	| j        D ]k}|
t          |j                   d         }|
dt          |j                            }
t'          |d
          r |||||	          \  }}	^ ||||          }l|                     |          }|                     |          }|                     |          }|	||	z  }| j         j        dk    rG|                    |j        d         gdgt          |j        dd                   z  R           }||z  }|s|fS t9          |          S )a  
        The [`UNet2DModel`] forward method.

        Args:
            sample (`torch.Tensor`):
                The noisy input tensor with the following shape `(batch, channel, height, width)`.
            timestep (`torch.Tensor` or `float` or `int`): The number of timesteps to denoise an input.
            class_labels (`torch.Tensor`, *optional*, defaults to `None`):
                Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~models.unets.unet_2d.UNet2DOutput`] instead of a plain tuple.

        Returns:
            [`~models.unets.unet_2d.UNet2DOutput`] or `tuple`:
                If `return_dict` is True, an [`~models.unets.unet_2d.UNet2DOutput`] is returned, otherwise a `tuple` is
                returned where the first element is the sample tensor.
        r   g      ?)dtypedevicer   N)r   z=class_labels should be provided when doing class conditioningrW   zJclass_embedding needs to be initialized in order to use class conditioning	skip_conv)hidden_statestembskip_sample)r   r   rR   r   )r   )configr6   r   	is_tensortensorlongr   ri   shapetoonesr   rn   rp   rq   rj   rJ   rm   rt   hasattrru   rv   resnetsr}   r   r   r7   reshaper   )r   r   rW   r   r   	timestepst_embemb	class_embr   down_block_res_samplesdownsample_blockres_samplesupsample_blocks                 r    forwardzUNet2DModel.forward   s   2 ;* 	&Z#%F 	y)) 	:i[
6=YYYII_Y'' 	:C	,@,@A,E,E!$**6=99I 
6<?)/ZcZj k k kk	y))
 tz**!!%((+# !`aaa{+z99#~~l;;,,\::==DJ=OOI	/CC!)l.Fijjj f%% #) $ 0 	2 	2'55 W3C3C"(s4 4 40[[ '7&6VRU&V&V&V#"k1"" ,, "n 	B 	BN0#n6L2M2M1M1O1OPK%;<Zs>CY?Z?Z>Z<Z%["~{33 B&4nV[#{&[&[#'SAA ##F++v&&v&&"k!F;*i77!))6<?*[qcCUVUWUWHXDYDY>Y*[*[\\Ii'F 	96****r   )Nr   r   Fr#   r   Tr$   r'   r*   r   r   r   r,   r,   r-   r.   r/   r0   Nr1   r2   TNNN)NT)r   r   r   r   r	   r   r   intr   boolstrfloatrh   r   r   r   r   __classcell__)r   s   @r    r"   r"   (   s       1 1f  >B$)#/ $,t*j.B !()"#%#,-!.2'0"*.*.-17Ta TaeCsCx$89:Ta Ta 	Ta
 "Ta !Ta Ta Ta  S/Ta c3hTa "#s(OTa Ta !&Ta  Ta Ta  !Ta" #Ta$ %Ta& %SM'Ta( )Ta* 'sm+Ta, -Ta. "%/Ta0 1Ta2 #3-3Ta4 #3-5Ta6 &c]7Ta Ta Ta Ta Ta Tat 04 g+ g+g+ eS01g+ u|,	g+
 g+ 
|U"	#g+ g+ g+ g+ g+ g+ g+ g+r   r"   )dataclassesr   typingr   r   r   r   torch.nnrk   configuration_utilsr   r	   utilsr
   
embeddingsr   r   r   modeling_utilsr   unet_2d_blocksr   r   r   r   r"   r   r   r    <module>r      s@   " ! ! ! ! ! ) ) ) ) ) ) ) ) ) )        B B B B B B B B       P P P P P P P P P P ' ' ' ' ' ' H H H H H H H H H H 	 	 	 	 	: 	 	 	r+ r+ r+ r+ r+*k r+ r+ r+ r+ r+r   