
    wi}                        d dl mZ d dlmZmZmZ d dlZd dlmZ d dl	mc m
Z ddlmZ ddlmZ ddlmZ dd	lmZmZmZmZmZ dd
lmZ ddlmZmZmZmZmZm Z   G d dej!                  Z" G d dej!                  Z#dej$        dej$        fdZ% G d dej!                  Z& G d dej!                  Z' G d dej!                  Z( G d dej!                  Z) G d dej!                  Z* G d dej!                  Z+dS )    )partial)OptionalTupleUnionN   )	deprecate   )get_activation)SpatialNorm)Downsample1DDownsample2DFirDownsample2DKDownsample2Ddownsample_2d)AdaGroupNorm)FirUpsample2DKUpsample2D
Upsample1D
Upsample2Dupfirdn2d_nativeupsample_2dc            "            e Zd ZdZddddddddd	d
dddddddedee         dededededee         dededededee         dedededee         f  fdZ	de
j        de
j        d e
j        fd!Z xZS )"ResnetBlockCondNorm2Da)  
    A Resnet block that use normalization layer that incorporate conditioning information.

    Parameters:
        in_channels (`int`): The number of channels in the input.
        out_channels (`int`, *optional*, default to be `None`):
            The number of output channels for the first conv2d layer. If None, same as `in_channels`.
        dropout (`float`, *optional*, defaults to `0.0`): The dropout probability to use.
        temb_channels (`int`, *optional*, default to `512`): the number of channels in timestep embedding.
        groups (`int`, *optional*, default to `32`): The number of groups to use for the first normalization layer.
        groups_out (`int`, *optional*, default to None):
            The number of groups to use for the second normalization layer. if set to None, same as `groups`.
        eps (`float`, *optional*, defaults to `1e-6`): The epsilon to use for the normalization.
        non_linearity (`str`, *optional*, default to `"swish"`): the activation function to use.
        time_embedding_norm (`str`, *optional*, default to `"ada_group"` ):
            The normalization layer for time embedding `temb`. Currently only support "ada_group" or "spatial".
        kernel (`torch.Tensor`, optional, default to None): FIR filter, see
            [`~models.resnet.FirUpsample2D`] and [`~models.resnet.FirDownsample2D`].
        output_scale_factor (`float`, *optional*, default to be `1.0`): the scale factor to use for the output.
        use_in_shortcut (`bool`, *optional*, default to `True`):
            If `True`, add a 1x1 nn.conv2d layer for skip-connection.
        up (`bool`, *optional*, default to `False`): If `True`, add an upsample layer.
        down (`bool`, *optional*, default to `False`): If `True`, add a downsample layer.
        conv_shortcut_bias (`bool`, *optional*, default to `True`):  If `True`, adds a learnable bias to the
            `conv_shortcut` output.
        conv_2d_out_channels (`int`, *optional*, default to `None`): the number of channels in the output.
            If None, same as `out_channels`.
    NF               ư>swish	ada_group      ?T)out_channelsconv_shortcutdropouttemb_channelsgroups
groups_outepsnon_linearitytime_embedding_normoutput_scale_factoruse_in_shortcutupdownconv_shortcut_biasconv_2d_out_channelsin_channelsr!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   c                :   t                                                       || _        ||n|}|| _        || _        || _        || _        || _        |
| _        ||}| j        dk    rt          ||||          | _
        n8| j        dk    rt          ||          | _
        nt          d| j                   t          j        ||ddd          | _        | j        dk    rt          ||||          | _        n8| j        dk    rt          ||          | _        nt          d| j                   t"          j                            |          | _        |p|}t          j        ||ddd          | _        t+          |	          | _        d x| _        | _        | j        rt3          |d	          | _        n| j        rt5          |ddd
          | _        || j        |k    n|| _        d | _        | j        r!t          j        ||ddd|          | _        d S d S )Nr   )r'   spatialz" unsupported time_embedding_norm:    r	   kernel_sizestridepaddingFuse_convopr9   r7   namer   r5   r6   r7   bias)super__init__r0   r!   use_conv_shortcutr,   r-   r*   r)   r   norm1r   
ValueErrornnConv2dconv1norm2torchDropoutr#   conv2r
   nonlinearityupsample
downsampler   r   r+   r"   )selfr0   r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   	__class__s                    g/root/.openclaw/workspace/chatterbox_venv_py311/lib/python3.11/site-packages/diffusers/models/resnet.pyr@   zResnetBlockCondNorm2D.__init__J   s^   ( 	&&2&:{{(!.	#6 #6 J#{22%m[&cRRRDJJ%22$[-@@DJJ\$BZ\\]]]Y{LaPQ[\]]]
#{22%m\:SVWWWDJJ%22$\=AADJJ\$BZ\\]]]x''003C|Y|-AqYZdefff
*=99*..7 	^&{UCCCDMMY 	^*;PQX\]]]DOKZKbt/3GGGhw! 	!#$'" " "D	 	    input_tensortembreturnc                 6   t          |          dk    s|                    dd           d}t          dd|           |}|                     ||          }|                     |          }| j        d|j        d         dk    r(|                                }|                                }|                     |          }|                     |          }n1| j        *|                     |          }|                     |          }| 	                    |          }| 
                    ||          }|                     |          }|                     |          }|                     |          }| j        |                     |          }||z   | j        z  }|S )Nr   scaleThe `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`.1.0.0@   )lengetr   rB   rK   rL   shape
contiguousrM   rF   rG   r#   rJ   r"   r*   )rN   rR   rS   argskwargsdeprecation_messagehidden_statesoutput_tensors           rP   forwardzResnetBlockCondNorm2D.forward   s   t99q==FJJw55A #Ugw(;<<<$

=$77))-88=$"1%+++6688 - 8 8 : :==66L MM-88MM_(??<88L OOM::M

=11

=$77))-88]33

=11)--l;;L%59QQrQ   )__name__
__module____qualname____doc__intr   boolfloatstrr@   rH   Tensorrc   __classcell__rO   s   @rP   r   r   ,   s{        B '+# $($#.%(*.#'.2%I I I I sm	I
 I I I I SMI I I !I #I "$I I  !I" !#I$ 'sm%I I I I I IV%EL % %Z_Zf % % % % % % % %rQ   r   c            (           e Zd ZdZddddddddd	dd
dddddddddedee         dededededee         dedededededee	j
                 dedee         dedededee         f& fd Zd!e	j
        d"e	j
        d#e	j
        fd$Z xZS )%ResnetBlock2Da9  
    A Resnet block.

    Parameters:
        in_channels (`int`): The number of channels in the input.
        out_channels (`int`, *optional*, default to be `None`):
            The number of output channels for the first conv2d layer. If None, same as `in_channels`.
        dropout (`float`, *optional*, defaults to `0.0`): The dropout probability to use.
        temb_channels (`int`, *optional*, default to `512`): the number of channels in timestep embedding.
        groups (`int`, *optional*, default to `32`): The number of groups to use for the first normalization layer.
        groups_out (`int`, *optional*, default to None):
            The number of groups to use for the second normalization layer. if set to None, same as `groups`.
        eps (`float`, *optional*, defaults to `1e-6`): The epsilon to use for the normalization.
        non_linearity (`str`, *optional*, default to `"swish"`): the activation function to use.
        time_embedding_norm (`str`, *optional*, default to `"default"` ): Time scale shift config.
            By default, apply timestep embedding conditioning with a simple shift mechanism. Choose "scale_shift" for a
            stronger conditioning with scale and shift.
        kernel (`torch.Tensor`, optional, default to None): FIR filter, see
            [`~models.resnet.FirUpsample2D`] and [`~models.resnet.FirDownsample2D`].
        output_scale_factor (`float`, *optional*, default to be `1.0`): the scale factor to use for the output.
        use_in_shortcut (`bool`, *optional*, default to `True`):
            If `True`, add a 1x1 nn.conv2d layer for skip-connection.
        up (`bool`, *optional*, default to `False`): If `True`, add an upsample layer.
        down (`bool`, *optional*, default to `False`): If `True`, add a downsample layer.
        conv_shortcut_bias (`bool`, *optional*, default to `True`):  If `True`, adds a learnable bias to the
            `conv_shortcut` output.
        conv_2d_out_channels (`int`, *optional*, default to `None`): the number of channels in the output.
            If None, same as `out_channels`.
    NFr   r   r   Tr   r   defaultr    )r!   r"   r#   r$   r%   r&   pre_normr'   r(   skip_time_actr)   kernelr*   r+   r,   r-   r.   r/   r0   r!   r"   r#   r$   r%   r&   rr   r'   r(   rs   r)   rt   r*   r+   r,   r-   r.   r/   c                   t                                                       |dk    rt          d          |dk    rt          d          d| _        || _        ||n|}|| _        || _        || _        || _        || _	        || _
        || _        ||}t          j                            |||	d          | _        t          j        ||ddd	          | _        |g| j
        d
k    rt          j        ||          | _        nH| j
        dk    rt          j        |d|z            | _        nt          d| j
         d          d | _        t          j                            |||	d          | _        t          j                            |          | _        |p|}t          j        ||ddd	          | _        t1          |
          | _        d x| _        | _        | j        rR|dk    rdfd| _        n|dk    r"t9          t:          j        dd          | _        nqt?          |d          | _        nZ| j        rS|dk    rdfd| _        n@|dk    r"t9          t:          j         dd          | _        ntC          |ddd          | _        || j        |k    n|| _"        d | _#        | j"        r!t          j        ||ddd|          | _#        d S d S )Nr   zkThis class cannot be used with `time_embedding_norm==ada_group`, please use `ResnetBlockCondNorm2D` insteadr2   ziThis class cannot be used with `time_embedding_norm==spatial`, please use `ResnetBlockCondNorm2D` insteadT
num_groupsnum_channelsr'   affiner3   r	   r4   rq   scale_shiftr   zunknown time_embedding_norm :  fir)r	   r3   r3   r	   c                 &    t          |           S N)rt   )r   x
fir_kernels    rP   <lambda>z(ResnetBlock2D.__init__.<locals>.<lambda>%  s    +a
*K*K*K rQ   sde_vpg       @nearest)scale_factormodeFr8   c                 &    t          |           S r~   )r   r   s    rP   r   z(ResnetBlock2D.__init__.<locals>.<lambda>-  s    M!J,O,O,O rQ   )r5   r6   r:   r;   r   r=   )$r?   r@   rC   rr   r0   r!   rA   r,   r-   r*   r)   rs   rH   rD   	GroupNormrB   rE   rF   Lineartime_emb_projrG   rI   r#   rJ   r
   rK   rL   rM   r   Finterpolater   
avg_pool2dr   r+   r"   )rN   r0   r!   r"   r#   r$   r%   r&   rr   r'   r(   rs   r)   rt   r*   r+   r,   r-   r.   r/   r   rO   s                       @rP   r@   zResnetBlock2D.__init__   s3   . 	+--}   )++{   &&2&:{{(!.	#6 #6 *JX''6Y\ei'jj
Y{LaPQ[\]]]
$'944%'Y}l%K%K"")]::%'Y}a,>N%O%O"" !]$BZ!]!]!]^^^!%DX'':L^ajn'oo
x''003C|Y|-AqYZdefff
*=99*..7 	b)
 K K K K8## 'Ci X X X *; G G GY 	b)
"O"O"O"O8##")!,Aa"P"P"P".{UTU\`"a"a"aKZKbt/3GGGhw! 	!#$'" " "D	 	rQ   rR   rS   rT   c                    t          |          dk    s|                    dd           d}t          dd|           |}|                     |          }|                     |          }| j        d|j        d         dk    r(|                                }|                                }|                     |          }|                     |          }n1| j        *|                     |          }|                     |          }| 	                    |          }| j
        ?| j        s|                     |          }| 
                    |          d d d d d d f         }| j        dk    r|||z   }|                     |          }nt| j        dk    rT|t          d| j                   t          j        |d	d
          \  }}|                     |          }|d
|z   z  |z   }n|                     |          }|                     |          }|                     |          }|                     |          }| j        |                     |          }||z   | j        z  }	|	S )Nr   rV   rW   rX   rY   rq   rz   z9 `temb` should not be None when `time_embedding_norm` is r   r	   )dim)rZ   r[   r   rB   rK   rL   r\   r]   rM   rF   r   rs   r)   rG   rC   rH   chunkr#   rJ   r"   r*   )
rN   rR   rS   r^   r_   r`   ra   
time_scale
time_shiftrb   s
             rP   rc   zResnetBlock2D.forward@  sw   t99q==FJJw55A #Ugw(;<<<$

=11))-88=$"1%+++6688 - 8 8 : :==66L MM-88MM_(??<88L OOM::M

=11)% /((..%%d++AAAqqq$,<=D#y00 - 4 JJ}55MM%66| jPTPhjj   &+[qa%@%@%@"J
 JJ}55M)Q^<zIMM JJ}55M))-88]33

=11)--l;;L%59QQrQ   )rd   re   rf   rg   rh   r   ri   rj   rk   rH   rl   r@   rc   rm   rn   s   @rP   rp   rp      s        D '+# $($##,)-%(*.#'.2+b b b b sm	b
 b b b b SMb b b b b !b &b  #!b" "$#b$ %b& 'b( !)b* 'sm+b b b b b bH5EL 5 5Z_Zf 5 5 5 5 5 5 5 5rQ   rp   tensorrT   c                 8   t          | j                  dk    r| d d d d d f         S t          | j                  dk    r| d d d d d d d f         S t          | j                  dk    r| d d d d dd d f         S t          dt          |            d          )Nr   r3      r   z`len(tensor)`: z has to be 2, 3 or 4.)rZ   r\   rC   )r   s    rP   rearrange_dimsr   y  s    
6<AaaaDj!!
6<AaaaD!!!m$$	V\		a		aaaAqqqj!!M3v;;MMMNNNrQ   c                        e Zd ZdZ	 	 ddededeeeeef         f         dedef
 fd	Zd
e	j
        de	j
        fdZ xZS )Conv1dBlocka  
    Conv1d --> GroupNorm --> Mish

    Parameters:
        inp_channels (`int`): Number of input channels.
        out_channels (`int`): Number of output channels.
        kernel_size (`int` or `tuple`): Size of the convolving kernel.
        n_groups (`int`, default `8`): Number of groups to separate the channels into.
        activation (`str`, defaults to `mish`): Name of the activation function.
       mishinp_channelsr!   r5   n_groups
activationc                     t                                                       t          j        ||||dz            | _        t          j        ||          | _        t          |          | _        d S )Nr   r7   )	r?   r@   rD   Conv1dconv1dr   
group_normr
   r   )rN   r   r!   r5   r   r   rO   s         rP   r@   zConv1dBlock.__init__  sb     	ilKQ\`aQabbb,x>>":..			rQ   inputsrT   c                     |                      |          }t          |          }|                     |          }t          |          }|                     |          }|S N)r   r   r   r   )rN   r   intermediate_reproutputs       rP   rc   zConv1dBlock.forward  s\     KK//*+<== OO,=>>*+<==,--rQ   )r   r   rd   re   rf   rg   rh   r   r   rk   r@   rH   rl   rc   rm   rn   s   @rP   r   r     s        	 	   / // / 3c3h/0	/
 / / / / / / /el u|        rQ   r   c                        e Zd ZdZ	 	 ddedededeeeeef         f         def
 fd	Zd
e	j
        de	j
        de	j
        fdZ xZS )ResidualTemporalBlock1Da  
    Residual 1D block with temporal convolutions.

    Parameters:
        inp_channels (`int`): Number of input channels.
        out_channels (`int`): Number of output channels.
        embed_dim (`int`): Embedding dimension.
        kernel_size (`int` or `tuple`): Size of the convolving kernel.
        activation (`str`, defaults `mish`): It is possible to choose the right activation function.
       r   r   r!   	embed_dimr5   r   c                 d   t                                                       t          |||          | _        t          |||          | _        t          |          | _        t          j        ||          | _	        ||k    rt          j
        ||d          nt          j                    | _        d S )Nr	   )r?   r@   r   conv_inconv_outr
   time_emb_actrD   r   time_embr   Identityresidual_conv)rN   r   r!   r   r5   r   rO   s         rP   r@   z ResidualTemporalBlock1D.__init__  s     	"<{KK#L,LL*:66	)\:: 9E8T8TBIlL!444Z\ZeZgZg 	rQ   r   trT   c                     |                      |          }|                     |          }|                     |          t          |          z   }|                     |          }||                     |          z   S )z
        Args:
            inputs : [ batch_size x inp_channels x horizon ]
            t : [ batch_size x embed_dim ]

        returns:
            out : [ batch_size x out_channels x horizon ]
        )r   r   r   r   r   r   )rN   r   r   outs       rP   rc   zResidualTemporalBlock1D.forward  sp     a  MM!ll6""^A%6%66mmC  T''////rQ   )r   r   r   rn   s   @rP   r   r     s        	 	  45 
 

 
 	

 3c3h/0
 
 
 
 
 
 
&0el 0u| 0 0 0 0 0 0 0 0 0rQ   r   c            	       r     e Zd ZdZ	 	 	 ddedee         dedef fd	Zddej	        dedej	        fdZ
 xZS )TemporalConvLayera  
    Temporal convolutional layer that can be used for video (sequence of images) input Code mostly copied from:
    https://github.com/modelscope/modelscope/blob/1509fdb973e5871f37148a4b5e5964cafd43e64d/modelscope/models/multi_modal/video_synthesis/unet_sd.py#L1016

    Parameters:
        in_dim (`int`): Number of input channels.
        out_dim (`int`): Number of output channels.
        dropout (`float`, *optional*, defaults to `0.0`): The dropout probability to use.
    Nr   r   in_dimout_dimr#   norm_num_groupsc                 B   t                                                       |p|}|| _        || _        t	          j        t	          j        ||          t	          j                    t	          j        ||dd                    | _	        t	          j        t	          j        ||          t	          j                    t	          j
        |          t	          j        ||dd                    | _        t	          j        t	          j        ||          t	          j                    t	          j
        |          t	          j        ||dd                    | _        t	          j        t	          j        ||          t	          j                    t	          j
        |          t	          j        ||dd                    | _        t          j                            | j        d         j                   t          j                            | j        d         j                   d S )Nr3   r	   r	   )r	   r   r   r   )r?   r@   r   r   rD   
Sequentialr   SiLUConv3drF   rI   rJ   conv3conv4initzeros_weightr>   )rN   r   r   r#   r   rO   s        rP   r@   zTemporalConvLayer.__init__  s    	#V ]L&11GIIIfgy)DDD
 


 ]L'22GIIJwIgvy)DDD	
 

 ]L'22GIIJwIgvy)DDD	
 

 ]L'22GIIJwIgvy)DDD	
 

 	tz"~,---
tz"~*+++++rQ   r	   ra   
num_framesrT   c                    |d d d f                              d|f|j        dd          z                                 ddddd          }|}|                     |          }|                     |          }|                     |          }|                     |          }||z   }|                    ddddd                               |j        d         |j        d         z  df|j        dd          z             }|S )Nr   r	   r   r   r3   r   )reshaper\   permuterF   rJ   r   r   )rN   ra   r   identitys       rP   rc   zTemporalConvLayer.forward  s   $'"**B
+;m>QRSRTRT>U+UVV^^_`bcefhiklmm 	 !

=11

=11

=11

=11 =0%--aAq!<<DD #m&9!&<<bAMDWXYXZXZD[[
 
 rQ   )Nr   r   )r	   rd   re   rf   rg   rh   r   rj   r@   rH   rl   rc   rm   rn   s   @rP   r   r     s          "&!', ',', #', 	',
 ', ', ', ', ', ',R U\ s 5<        rQ   r   c            	       z     e Zd ZdZ	 	 	 ddedee         dedef fd	Zd
ej	        dej	        dej	        fdZ
 xZS )TemporalResnetBlocka  
    A Resnet block.

    Parameters:
        in_channels (`int`): The number of channels in the input.
        out_channels (`int`, *optional*, default to be `None`):
            The number of output channels for the first conv2d layer. If None, same as `in_channels`.
        temb_channels (`int`, *optional*, default to `512`): the number of channels in timestep embedding.
        eps (`float`, *optional*, defaults to `1e-6`): The epsilon to use for the normalization.
    Nr   r   r0   r!   r$   r'   c                    t                                                       || _        ||n|}|| _        d}d |D             }t          j                            d||d          | _        t          j        |||d|          | _	        |t          j
        ||          | _        nd | _        t          j                            d||d          | _        t          j                            d          | _        t          j        |||d|          | _        t!          d	          | _        | j        |k    | _        d | _        | j        r t          j        ||ddd
          | _        d S d S )Nr   c                     g | ]}|d z  S )r    ).0ks     rP   
<listcomp>z0TemporalResnetBlock.__init__.<locals>.<listcomp>7  s    ///a16///rQ   r   Trv   r	   r4   r   silur   )r?   r@   r0   r!   rH   rD   r   rB   r   rF   r   r   rG   rI   r#   rJ   r
   rK   r+   r"   )rN   r0   r!   r$   r'   r5   r7   rO   s          rP   r@   zTemporalResnetBlock.__init__*  s    	&&2&:{{(//;///X''2KUXae'ff
Y#
 
 

 $!#=,!G!GD!%DX''2LVYbf'gg
x'',,Y#
 
 

 +622#/<?! 	!#" " "D	 	rQ   rR   rS   rT   c                 2   |}|                      |          }|                     |          }|                     |          }| j        Y|                     |          }|                     |          d d d d d d d d f         }|                    ddddd          }||z   }|                     |          }|                     |          }|                     |          }|                     |          }| j        |                     |          }||z   }|S )Nr   r   r	   r3   r   )	rB   rK   rF   r   r   rG   r#   rJ   r"   )rN   rR   rS   ra   rb   s        rP   rc   zTemporalResnetBlock.forward`  s   $

=11))-88

=11)$$T**D%%d++AAAqqq!!!T4,?@D<<1aA..D)D0M

=11))-88]33

=11)--l;;L$}4rQ   )Nr   r   r   rn   s   @rP   r   r     s        	 	 '+ 4 44 sm4 	4
 4 4 4 4 4 4lEL          rQ   r   c                        e Zd ZdZ	 	 	 	 	 	 	 dded	ee         d
ededee         dedef fdZ	 	 dde	j
        dee	j
                 dee	j
                 fdZ xZS )SpatioTemporalResBlocka  
    A SpatioTemporal Resnet block.

    Parameters:
        in_channels (`int`): The number of channels in the input.
        out_channels (`int`, *optional*, default to be `None`):
            The number of output channels for the first conv2d layer. If None, same as `in_channels`.
        temb_channels (`int`, *optional*, default to `512`): the number of channels in timestep embedding.
        eps (`float`, *optional*, defaults to `1e-6`): The epsilon to use for the spatial resenet.
        temporal_eps (`float`, *optional*, defaults to `eps`): The epsilon to use for the temporal resnet.
        merge_factor (`float`, *optional*, defaults to `0.5`): The merge factor to use for the temporal mixing.
        merge_strategy (`str`, *optional*, defaults to `learned_with_images`):
            The merge strategy to use for the temporal mixing.
        switch_spatial_to_temporal_mix (`bool`, *optional*, defaults to `False`):
            If `True`, switch the spatial and temporal mixing.
    Nr   r         ?learned_with_imagesFr0   r!   r$   r'   temporal_epsmerge_factorswitch_spatial_to_temporal_mixc	                     t                                                       t          ||||          | _        t	          ||n|||n||||n|          | _        t          |||          | _        d S )N)r0   r!   r$   r'   )alphamerge_strategyr   )r?   r@   rp   spatial_res_blockr   temporal_res_blockAlphaBlender
time_mixer)
rN   r0   r!   r$   r'   r   r   r   r   rO   s
            rP   r@   zSpatioTemporalResBlock.__init__  s     	!.#%'	"
 "
 "
 #6(4(@k)5)A{' , 8c	#
 #
 #
 ')+I
 
 
rQ   ra   rS   image_only_indicatorc                 B   |j         d         }|                     ||          }|j         \  }}}}||z  }	|d d d f                             |	||||                              ddddd          }
|d d d f                             |	||||                              ddddd          }||                    |	|d          }|                     ||          }|                     |
||          }|                    ddddd                              ||||          }|S )Nr   r   r   r	   r3   r   )	x_spatial
x_temporalr   )r\   r   r   r   r   r   )rN   ra   rS   r   r   batch_frameschannelsheightwidth
batch_sizehidden_states_mixs              rP   rc   zSpatioTemporalResBlock.forward  s`    */3
..}dCC0=0C-h!Z/
 $'"**:z8VUZ[[ccdeghjkmnpqrr 	 $'"**:z8VUZ[[ccdeghjkmnpqrr 	 <<
J;;D//tDD'$!5 ( 
 
 &--aAq!<<DD\S[]cejkkrQ   )Nr   r   Nr   r   F)NN)rd   re   rf   rg   rh   r   rj   ri   r@   rH   rl   rc   rm   rn   s   @rP   r   r   {  s         ( '+ (,!,/4
 

 sm
 	

 
 uo
 
 )-
 
 
 
 
 
H (,7;	 | u|$ 'u|4	       rQ   r   c            	            e Zd ZdZg dZ	 	 ddededef fdZd	e	j
        d
ede	j
        fdZ	 dde	j
        de	j
        d	ee	j
                 de	j
        fdZ xZS )r   a  
    A module to blend spatial and temporal features.

    Parameters:
        alpha (`float`): The initial value of the blending factor.
        merge_strategy (`str`, *optional*, defaults to `learned_with_images`):
            The merge strategy to use for the temporal mixing.
        switch_spatial_to_temporal_mix (`bool`, *optional*, defaults to `False`):
            If `True`, switch the spatial and temporal mixing.
    )learnedfixedr   r   Fr   r   r   c                    t                                                       || _        || _        || j        vrt          d| j                   | j        dk    r+|                     dt          j        |g                     d S | j        dk    s| j        dk    rH| 	                    dt          j
                            t          j        |g                               d S t          d| j                   )Nzmerge_strategy needs to be in r   
mix_factorr   r   zUnknown merge strategy )r?   r@   r   r   
strategiesrC   register_bufferrH   rl   register_parameterrD   	Parameter)rN   r   r   r   rO   s       rP   r@   zAlphaBlender.__init__  s     	,.L+00OdoOOPPP'))  u|UG/D/DEEEEE I--1DH]1]1]##L%(2D2DU\SXRYEZEZ2[2[\\\\\Lt7JLLMMMrQ   r   ndimsrT   c                    | j         dk    r| j        }n| j         dk    rt          j        | j                  }n| j         dk    r|t	          d          t          j        |                                t          j        dd|j                  t          j        | j                  d                   }|dk    r|d d d d d d d f         }nA|d	k    r!|	                    d
          d d d d f         }nt	          d| d          t          |S )Nr   r   r   zMPlease provide image_only_indicator to use learned_with_images merge strategyr	   )device).Nr   r3   r   zUnexpected ndims z. Dimensions should be 3 or 5)r   r   rH   sigmoidrC   whereri   onesr   r   NotImplementedError)rN   r   r   r   s       rP   	get_alphazAlphaBlender.get_alpha  s&   '))OEE I--M$/22EE $999#+ !pqqqK$))++
1a(<(CDDDdo..y9 E zzaaaqqq$45!b))!!!T4-8 !YU!Y!Y!YZZZ &%rQ   Nr   r   c                     |                      ||j                  }|                    |j                  }| j        rd|z
  }||z  d|z
  |z  z   }|S )Nr    )r   ndimtodtyper   )rN   r   r   r   r   r   s         rP   rc   zAlphaBlender.forward  s^     3Y^DD)). 	 %KEIu
 ::rQ   )r   Fr   )rd   re   rf   rg   r   rj   rk   ri   r@   rH   rl   rh   r   r   rc   rm   rn   s   @rP   r   r     s       	 	 =<<J
 4/4	N NN N )-	N N N N N N(el 3 5<    F 8<	 < L 'u|4	
 
       rQ   r   ),	functoolsr   typingr   r   r   rH   torch.nnrD   torch.nn.functional
functionalr   utilsr   activationsr
   attention_processorr   downsamplingr   r   r   r   r   normalizationr   
upsamplingr   r   r   r   r   r   Moduler   rp   rl   r   r   r   r   r   r   r   r   rQ   rP   <module>r     s          ) ) ) ) ) ) ) ) ) )                       ' ' ' ' ' ' , , , , , ,              ( ' ' ' ' '               N N N N NBI N N Nbx x x x xBI x x xxO5< OEL O O O O         ")      H,0 ,0 ,0 ,0 ,0bi ,0 ,0 ,0^D D D D D	 D D DNY Y Y Y Y") Y Y YzQ Q Q Q QRY Q Q QhN N N N N29 N N N N NrQ   