
    wi              
       h   d dl Z d dlmZmZmZmZmZmZ d dlZd dl	m
Z
mZmZmZ ddlmZ ddlmZmZmZ ddlmZmZmZmZ ddlmZ dd	lmZ dd
lmZmZm Z m!Z!m"Z"m#Z# ddl$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z* ddl+m,Z, ddl-m.Z. ddl/m0Z0 ddl1m2Z2m3Z3 ddl4m5Z5  e'j6        e7          Z8dZ9 G d de2e3eeee0          Z:dS )    N)AnyCallableDictListOptionalUnion)CLIPImageProcessorCLIPTextModelCLIPTokenizerCLIPVisionModelWithProjection   )PipelineImageInput)IPAdapterMixinLoraLoaderMixinTextualInversionLoaderMixin)AutoencoderKLImageProjectionUNet2DConditionModelUNetMotionModel)adjust_lora_scale_text_encoder)MotionAdapter)DDIMSchedulerDPMSolverMultistepSchedulerEulerAncestralDiscreteSchedulerEulerDiscreteSchedulerLMSDiscreteSchedulerPNDMScheduler)USE_PEFT_BACKEND	deprecateloggingreplace_example_docstringscale_lora_layersunscale_lora_layers)randn_tensor)VideoProcessor   )FreeInitMixin)DiffusionPipelineStableDiffusionMixin   )AnimateDiffPipelineOutputa  
    Examples:
        ```py
        >>> import torch
        >>> from diffusers import MotionAdapter, AnimateDiffPipeline, DDIMScheduler
        >>> from diffusers.utils import export_to_gif

        >>> adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2")
        >>> pipe = AnimateDiffPipeline.from_pretrained("frankjoshua/toonyou_beta6", motion_adapter=adapter)
        >>> pipe.scheduler = DDIMScheduler(beta_schedule="linear", steps_offset=1, clip_sample=False)
        >>> output = pipe(prompt="A corgi walking in the park")
        >>> frames = output.frames[0]
        >>> export_to_gif(frames, "animation.gif")
        ```
c            /           e Zd ZdZdZg dZg dZ	 	 d9dedede	d	e
eef         d
ede
eeeeeef         dedef fdZ	 	 	 	 	 d:deej                 deej                 dee         dee         fdZd;dZd Zd Zd Z 	 	 	 	 	 	 d<dZ!	 d;dZ"e#d             Z$e#d             Z%e#d             Z&e#d             Z'e#d             Z( ej)                     e*e+          ddddd d!dd"d#ddddddd$d%dddd&gfd'e
e,e-e,         f         d(ee         d)ee         d*ee         d+ed,ed-ee
e,e-e,         f                  d.ee         d/ed0ee
ej.        e-ej.                 f                  d&eej                 deej                 deej                 d1ee/         d2ee-ej                          d3ee,         d4e0d5ee1e,e2f                  dee         d6ee3eee1gdf                  d7e-e,         f*d8                        Z4 xZ5S )=AnimateDiffPipelineaX  
    Pipeline for text-to-video generation.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
    implemented for all pipelines (downloading, saving, running on a particular device, etc.).

    The pipeline also inherits the following loading methods:
        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
        - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
        - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters

    Args:
        vae ([`AutoencoderKL`]):
            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
        text_encoder ([`CLIPTextModel`]):
            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
        tokenizer (`CLIPTokenizer`):
            A [`~transformers.CLIPTokenizer`] to tokenize text.
        unet ([`UNet2DConditionModel`]):
            A [`UNet2DConditionModel`] used to create a UNetMotionModel to denoise the encoded video latents.
        motion_adapter ([`MotionAdapter`]):
            A [`MotionAdapter`] to be used in combination with `unet` to denoise the encoded video latents.
        scheduler ([`SchedulerMixin`]):
            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
    z&text_encoder->image_encoder->unet->vae)feature_extractorimage_encodermotion_adapter)latentsprompt_embedsnegative_prompt_embedsNvaetext_encoder	tokenizerunetr0   	schedulerr.   r/   c	           
      ^   t                                                       t          |t                    rt	          j        ||          }|                     ||||||||           dt          | j        j	        j
                  dz
  z  | _        t          d| j                  | _        d S )N)r4   r5   r6   r7   r0   r8   r.   r/   r&   r*   F)	do_resizevae_scale_factor)super__init__
isinstancer   r   from_unet2dregister_moduleslenr4   configblock_out_channelsr;   r%   video_processor)
selfr4   r5   r6   r7   r0   r8   r.   r/   	__class__s
            /root/.openclaw/workspace/chatterbox_venv_py311/lib/python3.11/site-packages/diffusers/pipelines/animatediff/pipeline_animatediff.pyr=   zAnimateDiffPipeline.__init__l   s    $ 	d011 	E".t^DDD%)/' 	 		
 		
 		
 !"c$(/*L&M&MPQ&Q R-PTPefff    r2   r3   
lora_scale	clip_skipc
                 D
   |Nt          | t                    r9|| _        t          st	          | j        |           nt          | j        |           |t          |t                    rd}
n4|%t          |t                    rt          |          }
n|j
        d         }
|t          | t                    r|                     || j                  }|                     |d| j        j        dd          }|j        }|                     |dd	          j        }|j
        d
         |j
        d
         k    rrt!          j        ||          s]| j                            |dd| j        j        dz
  d
f                   }t&                              d| j        j         d|            t+          | j        j        d          r,| j        j        j        r|j                            |          }nd}|	3|                     |                    |          |          }|d         }n\|                     |                    |          |d          }|d
         |	dz             }| j        j                            |          }| j        | j        j        }n| j        | j        j        }n|j        }|                    ||          }|j
        \  }}}|                    d|d          }|                    ||z  |d
          }|r||dg|
z  }n|NtA          |          tA          |          ur0tC          dtA          |           dtA          |           d          t          |t                    r|g}n>|
t          |          k    r)tE          d| dt          |           d| d|
 d	          |}t          | t                    r|                     || j                  }|j
        d         }|                     |d|dd          }t+          | j        j        d          r,| j        j        j        r|j                            |          }nd}|                     |j                            |          |          }|d         }|rU|j
        d         }|                    ||          }|                    d|d          }|                    |
|z  |d
          }| j        1t          | t                    rt          rtG          | j        |           ||fS )a  
        Encodes the prompt into text encoder hidden states.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                prompt to be encoded
            device: (`torch.device`):
                torch device
            num_images_per_prompt (`int`):
                number of images that should be generated per prompt
            do_classifier_free_guidance (`bool`):
                whether to use classifier free guidance or not
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                less than `1`).
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            lora_scale (`float`, *optional*):
                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
            clip_skip (`int`, *optional*):
                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                the output of the pre-final layer will be used for computing the prompt embeddings.
        Nr*   r   
max_lengthTpt)paddingrL   
truncationreturn_tensorslongest)rN   rP   z\The following part of your input was truncated because CLIP can only handle sequences up to z	 tokens: use_attention_mask)attention_mask)rT   output_hidden_states)dtypedevice z?`negative_prompt` should be the same type to `prompt`, but got z != .z`negative_prompt`: z has batch size z, but `prompt`: zT. Please make sure that passed `negative_prompt` matches the batch size of `prompt`.)$r>   r   _lora_scaler   r   r5   r"   strlistrA   shaper   maybe_convert_promptr6   model_max_length	input_idstorchequalbatch_decodeloggerwarninghasattrrB   rS   rT   to
text_modelfinal_layer_normrV   r7   repeatviewtype	TypeError
ValueErrorr#   )rE   promptrW   num_images_per_promptdo_classifier_free_guidancenegative_promptr2   r3   rI   rJ   
batch_sizetext_inputstext_input_idsuntruncated_idsremoved_textrT   prompt_embeds_dtypebs_embedseq_len_uncond_tokensrL   uncond_inputs                          rG   encode_promptz!AnimateDiffPipeline.encode_prompt   s   V !j&G&G!)D $ A.t/@*MMMM!$"3Z@@@*VS"9"9JJJvt$<$<VJJ&,Q/J $ ;<< K2264>JJ..$>:# )  K )2N"nnVYW[n\\fO$R(N,@,DDDU[N ND  $~::#AAAt~'F'JR'O$OP    Q7Q QBNQ Q  
 t(/1EFF &4K\KcKv &!,!;!>!>v!F!F!%  $ 1 1.2C2CF2K2K\j 1 k k -a 0 $ 1 1"%%f--ncg !2 ! ! !.b 1IM2B C
 !% 1 < M Mm \ \("&"3"9Y""&)/"/"5%((/B6(RR,2'1%,,Q0EqII%**86K+KWVXYY ' *	?+A+I&!#z 1#VD<Q<Q(Q(Q(VZ[jVkVk ( (V( ( (   OS11 	0!0 1s?3333 3/ 3 33K_K_ 3 33 30:3 3 3   !0 $ ;<< Y $ 9 9- X X&,Q/J>>$%# *  L t(/1EFF &4K\KcKv &!-!<!?!?!G!G!%%)%6%6&))&11- &7 & &" &<A%>"& 	r,215G%;%>%>EXag%>%h%h"%;%B%B1F[]^%_%_"%;%@%@NcAcelnp%q%q"($00 C5E C#D$5zBBB444rH   c                    t          | j                                                  j        }t	          |t
          j                  s|                     |d          j        }|	                    ||          }|r|                     |d          j
        d         }|                    |d          }|                     t          j        |          d          j
        d         }|                    |d          }||fS |                     |          j        }|                    |d          }t          j        |          }	||	fS )	NrM   )rP   )rW   rV   T)rU   r   dim)nextr/   
parametersrV   r>   ra   Tensorr.   pixel_valuesrg   hidden_statesrepeat_interleave
zeros_likeimage_embeds)
rE   imagerW   rp   rU   rV   image_enc_hidden_statesuncond_image_enc_hidden_statesr   uncond_image_embedss
             rG   encode_imagez AnimateDiffPipeline.encode_imageG  sa   T'224455;%.. 	T**5*FFSEe44 	5&*&8&8UY&8&Z&Z&hik&l#&=&O&OPekl&O&m&m#-1-?-? ''d .@ . .B. * .L-]-]%1 .^ . .* +,JJJ--e44AL'99:OUV9WWL"'"2<"@"@!444rH   c           
      X   |Ut          |t                    s|g}t          |          t          | j        j        j                  k    r?t          dt          |           dt          | j        j        j                   d          g }t          || j        j        j                  D ]\  }}t          |t                     }	| 	                    ||d|	          \  }
}t          j        |
g|z  d          }
t          j        |g|z  d          }|r+t          j        ||
g          }
|
                    |          }
|                    |
           ndg}g }|D ]}
|r|
                    d          \  }}
 |
j        |g|t          |
j        dd                    z  R  }
 |j        |g|t          |j        dd                    z  R  }t          j        ||
g          }
n+ |
j        |g|t          |
j        dd                    z  R  }
|                    |
           |S )NzK`ip_adapter_image` must have same length as the number of IP Adapters. Got z images and z IP Adapters.r*   r   r   r&   )r>   r\   rA   r7   encoder_hid_projimage_projection_layersrn   zipr   r   ra   stackcatrg   appendchunkrj   r]   )rE   ip_adapter_imageip_adapter_image_embedsrW   rp   rq   r   single_ip_adapter_imageimage_proj_layeroutput_hidden_statesingle_image_embedssingle_negative_image_embedsrepeat_dimss                rG   prepare_ip_adapter_image_embedsz3AnimateDiffPipeline.prepare_ip_adapter_image_embeds`  s    #*.55 6$4#5 #$$DI,F,^(_(___  Lbefvbwbw  L  L  FI  JN  JS  Jd  J|  F}  F}  L  L  L   L=@ $)"<"T> > 9 99')9 +55E*W*W&W#DHDUDU+VQ8KE EA#%A ',k3F2GJ_2_ef&g&g&g#/4{125JJPQ0 0 0, / I*/)5QSf4g*h*h'*=*@*@*H*H'##$78888!9$ #KL'> 9 9#. H[HaHabcHdHdE02E*D*=*D-+1<sCVC\]^]_]_C`?a?a1a+ + +' 4W3O3V-41<sC_CefgfhfhCi?j?j1j4 4 40 +0)5QSf4g*h*h''*D*=*D-+1<sCVC\]^]_]_C`?a?a1a+ + +' ##$78888rH   c                    d| j         j        j        z  |z  }|j        \  }}}}}|                    ddddd                              ||z  |||          }| j                             |          j        }|d d d f                             ||df|j        dd          z                                 ddddd          }|                                }|S )Nr*   r   r&   r      rR   )	r4   rB   scaling_factorr]   permutereshapedecodesamplefloat)	rE   r1   rs   channels
num_framesheightwidthr   videos	            rG   decode_latentsz"AnimateDiffPipeline.decode_latents  s    dho44w>:A-7
Hj&%//!Q1a0088j9PRZ\bdijj((/dAAAg&&
J'CekRSRTRTo'UVV^^_`bcefhiklmmrH   c                 6   dt          t          j        | j        j                  j                                                  v }i }|r||d<   dt          t          j        | j        j                  j                                                  v }|r||d<   |S )Neta	generator)setinspect	signaturer8   stepr   keys)rE   r   r   accepts_etaextra_step_kwargsaccepts_generators         rG   prepare_extra_step_kwargsz-AnimateDiffPipeline.prepare_extra_step_kwargs  s     s7#4T^5H#I#I#T#Y#Y#[#[\\\ 	+'*e$ (3w/@AT/U/U/`/e/e/g/g+h+hh 	7-6k*  rH   c                     |dz  dk    s	|dz  dk    rt          d| d| d          |>t          |t                    r|dk    r#t          d| dt          |           d          |
At	           fd|
D                       s&t          d	 j         d
 fd|
D                        ||t          d| d| d          ||t          d          |It          |t                    s4t          |t                    st          dt          |                     ||t          d| d| d          |2|0|j        |j        k    r t          d|j         d|j         d          ||	t          d          |	at          |	t                    st          dt          |	                     |	d         j	        dvr t          d|	d         j	         d          d S d S )N   r   z7`height` and `width` have to be divisible by 8 but are z and rY   z5`callback_steps` has to be a positive integer but is z	 of type c              3   *   K   | ]}|j         v V  d S N_callback_tensor_inputs.0krE   s     rG   	<genexpr>z3AnimateDiffPipeline.check_inputs.<locals>.<genexpr>  sD       F
 F
23A--F
 F
 F
 F
 F
 F
rH   z2`callback_on_step_end_tensor_inputs` has to be in z, but found c                 &    g | ]}|j         v|S  r   r   s     rG   
<listcomp>z4AnimateDiffPipeline.check_inputs.<locals>.<listcomp>  sV      pH  pH  pHvw  bc  ko  kG  bG  bGpq  bG  bG  bGrH   zCannot forward both `prompt`: z and `prompt_embeds`: z2. Please make sure to only forward one of the two.zeProvide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined.z2`prompt` has to be of type `str` or `list` but is z'Cannot forward both `negative_prompt`: z and `negative_prompt_embeds`: zu`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but got: `prompt_embeds` z != `negative_prompt_embeds` zProvide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined.z:`ip_adapter_image_embeds` has to be of type `list` but is )r   r   zF`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is D)
rn   r>   intrl   allr   r[   r\   r]   ndim)rE   ro   r   r   callback_stepsrr   r2   r3   r   r   "callback_on_step_end_tensor_inputss   `          rG   check_inputsz AnimateDiffPipeline.check_inputs  sZ    A:??eai1nnlW]lldilllmmm%z.#/N/N%R`deReRe, , ,((, , ,   .9# F
 F
 F
 F
7YF
 F
 F
 C
 C
9  JTEa  J  J  pH  pH  pH  pH  |^  pH  pH  pH  J  J   -";0 0 0} 0 0 0   ^ 5w   FC)@)@TZ\`IaIa`RVW]R^R^``aaa&+A+M_/ _ _*_ _ _  
 $)?)K"&<&BBB 8-:-@8 8.48 8 8   ',C,O ^   #.5t<<  pQUVmQnQnpp   )+0>> ]tuv]w]|   /.
 ?>rH   c
                 >   ||||| j         z  || j         z  f}
t          |t                    r6t          |          |k    r#t	          dt          |           d| d          |	t          |
|||          }	n|	                    |          }	|	| j        j        z  }	|	S )Nz/You have passed a list of generators of length z+, but requested an effective batch size of z@. Make sure the batch size matches the length of the generators.)r   rW   rV   )	r;   r>   r\   rA   rn   r$   rg   r8   init_noise_sigma)rE   rs   num_channels_latentsr   r   r   rV   rW   r   r1   r]   s              rG   prepare_latentsz#AnimateDiffPipeline.prepare_latents  s      d++T**
 i&& 	3y>>Z+G+Gi#i.. i i&i i i  
 ?"5IfTYZZZGGjj((G DN;;rH   c                     | j         S r   _guidance_scalerE   s    rG   guidance_scalez"AnimateDiffPipeline.guidance_scale  s    ##rH   c                     | j         S r   )
_clip_skipr   s    rG   rJ   zAnimateDiffPipeline.clip_skip  s
    rH   c                     | j         dk    S )Nr*   r   r   s    rG   rq   z/AnimateDiffPipeline.do_classifier_free_guidance  s    #a''rH   c                     | j         S r   )_cross_attention_kwargsr   s    rG   cross_attention_kwargsz*AnimateDiffPipeline.cross_attention_kwargs!  s    ++rH   c                     | j         S r   )_num_timestepsr   s    rG   num_timestepsz!AnimateDiffPipeline.num_timesteps%  s    ""rH      2   g      @r*   g        pilTr1   ro   r   r   r   num_inference_stepsr   rr   num_videos_per_promptr   r   r   r   output_typereturn_dictr   callback_on_step_endr   c                 	   |                     dd          }|                     dd          }|t          ddd           |t          ddd           |p| j        j        j        | j        z  }|p| j        j        j        | j        z  }d}|                     ||||||||||
  
         || _        || _        || _	        |t          |t                    rd}n4|%t          |t                    rt          |          }n|j        d         }| j        }| j        | j                            d	d          nd}|                     |||| j        ||||| j        
	  	        \  }}| j        rt+          j        ||g          }||!|                     |||||z  | j                  }| j                            ||           | j        j        }| j        j        j        }|                     ||z  |||||j        ||
|	  	        }|                     |
|	          }||d|ind} | j        r| j         nd}!tC          |!          D ]}"| j        r"| "                    ||"|||j        |
          \  }}t          |          | _#        t          |          || j        j$        z  z
  }#| %                    | j#                  5 }$tM          |          D ]v\  }%}&| j        rt+          j        |gdz            n|}'| j        '                    |'|&          }'|                     |'|&|||           j(        }(| j        r#|()                    d          \  })}*|)||*|)z
  z  z   }( | j        j*        |(|&|fi |j+        }|ni }+|D ]},tY                      |,         |+|,<    || |%|&|+          }-|-                     d|          }|-                     d|          }|-                     d|          }|%t          |          dz
  k    s|%dz   |#k    rB|%dz   | j        j$        z  dk    r,|$-                                 ||%|z  dk    r ||%|&|           x	 ddd           n# 1 swxY w Y   |dk    r|}.n1| .                    |          }/| j/        0                    |/|          }.| 1                                 |s|.fS te          |.          S )uG  
        The call function to the pipeline for generation.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
                The height in pixels of the generated video.
            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
                The width in pixels of the generated video.
            num_frames (`int`, *optional*, defaults to 16):
                The number of video frames that are generated. Defaults to 16 frames which at 8 frames per seconds
                amounts to 2 seconds of video.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality videos at the
                expense of slower inference.
            guidance_scale (`float`, *optional*, defaults to 7.5):
                A higher guidance scale value encourages the model to generate images closely linked to the text
                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
            eta (`float`, *optional*, defaults to 0.0):
                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                generation deterministic.
            latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor is generated by sampling using the supplied random `generator`. Latents should be of shape
                `(batch_size, num_channel, num_frames, height, width)`.
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
                provided, text embeddings are generated from the `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
            ip_adapter_image: (`PipelineImageInput`, *optional*):
                Optional image input to work with IP Adapters.
            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
                provided, embeddings are computed from the `ip_adapter_image` input argument.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated video. Choose between `torch.Tensor`, `PIL.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] instead
                of a plain tuple.
            cross_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            clip_skip (`int`, *optional*):
                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                the output of the pre-final layer will be used for computing the prompt embeddings.
            callback_on_step_end (`Callable`, *optional*):
                A function that calls at the end of each denoising steps during the inference. The function is called
                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
                `callback_on_step_end_tensor_inputs`.
            callback_on_step_end_tensor_inputs (`List`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                `._callback_tensor_inputs` attribute of your pipeline class.

        Examples:

        Returns:
            [`~pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] or `tuple`:
                If `return_dict` is `True`, [`~pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] is
                returned, otherwise a `tuple` is returned where the first element is a list with the generated frames.
        callbackNr   z1.0.0zjPassing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`zpPassing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`r*   r   scale)r2   r3   rI   rJ   )rW   r   )totalr&   )encoder_hidden_statesr   added_cond_kwargsr1   r2   r3   latent)r   r   )frames)3popr   r7   rB   sample_sizer;   r   r   r   r   r>   r[   r\   rA   r]   _execution_devicer   getr~   rq   rJ   ra   r   r   r8   set_timesteps	timestepsin_channelsr   rV   r   free_init_enabled_free_init_num_itersrange_apply_free_initr   orderprogress_bar	enumeratescale_model_inputr   r   r   prev_samplelocalsupdater   rD   postprocess_videomaybe_free_model_hooksr+   )0rE   ro   r   r   r   r   r   rr   r   r   r   r1   r2   r3   r   r   r   r   r   rJ   r   r   kwargsr   r   rs   rW   text_encoder_lora_scaler   r   r   r   r   num_free_init_itersfree_init_iternum_warmup_stepsr   itlatent_model_input
noise_prednoise_pred_uncondnoise_pred_textcallback_kwargsr   callback_outputsr   video_tensors0                                                   rG   __call__zAnimateDiffPipeline.__call__)  sB   L ::j$//$4d;;|  
 %  C   O49+7$:OOM)58MM ! 	"#.	
 	
 	
  .#'=$ *VS"9"9JJJvt$<$<VJJ&,Q/J' ?C>Y>eD'++GT:::ko 	  150B0B!,'#9.n 1C 
1
 
1
-- + 	O!I'=}&MNNM'+B+N?? '220 L 	$$%8$HHHN,	  $y/;&&.. 

 

 !::9cJJ
  +/F/R \** 	 <@;QXd77WX#$788 /	4 /	4N% %)%:%:^-@&'-Yb& &" #&i..D"9~~0CdnFZ0ZZ "")<"== %4%i00 $4 $4DAqEIEe)rG9q=)A)A)Akr&)-)I)IJ\^_)`)`& "&*.;/E*; "+ " "   7 p=G=M=Ma=P=P:)?%6?]nKn9o%o
 2dn1*a^^L]^^jG+7*,!C = =A17!OA..+?+?aO+\+\("2"6"6y'"J"J(8(<(<_m(\(\1A1E1EF^`v1w1w. C	NNQ...AE=M3M3MSTWXSX\`\j\pRptuRuRu$++---#/A4F!4K4K$HQ7333I$4%4 %4 %4 %4 %4 %4 %4 %4 %4 %4 %4 %4 %4 %4 %4P (""EE..w77L(::[f:ggE 	##%%% 	8O(6666s   ;FQQ	Q	)NN)NNNNNr   )NNNNNN)6__name__
__module____qualname____doc__model_cpu_offload_seq_optional_componentsr   r   r
   r   r   r   r   r   r   r   r   r   r   r   r	   r   r=   r   ra   r   r   r   r~   r   r   r   r   r   r   propertyr   rJ   rq   r   r   no_gradr!   EXAMPLE_DOC_STRINGr[   r   	Generatorr   boolr   r   r   r  __classcell__)rF   s   @rG   r-   r-   D   s        8 ESSSTTT" 157;!!g !g!g $!g !	!g
 (/9:!g &!g  "+')
!g .!g  5!!g !g !g !g !g !gT 049=&*#'t5 t5  -t5 !) 6t5 UOt5 C=t5 t5 t5 t5n5 5 5 521 1 1h
 
 
! ! !0 # $+/C C C CN nr   2 $ $ X$   X ( ( X( , , X, # # X# U]__122 )-$& $##% #;?/0MQ*.049=9=@D%* ;?#'KO9B-O7 O7c49n%O7 SMO7 	O7
 }O7 !O7 O7 "%T#Y"78O7  (}O7 O7 E%/43H"HIJO7 %,'O7  -O7 !) 6O7 ##56O7  "*$u|*<!=!O7" c]#O7$ %O7& !)c3h 8'O7( C=)O7* 'xc40@$0F'GH+O7, -1I-O7 O7 O7 32 _O7 O7 O7 O7 O7rH   r-   );r   typingr   r   r   r   r   r   ra   transformersr	   r
   r   r   image_processorr   loadersr   r   r   modelsr   r   r   r   models.lorar   models.unets.unet_motion_modelr   
schedulersr   r   r   r   r   r   utilsr   r   r    r!   r"   r#   utils.torch_utilsr$   rD   r%   free_init_utilsr'   pipeline_utilsr(   r)   pipeline_outputr+   
get_loggerr  rd   r  r-   r   rH   rG   <module>r.     sQ    = = = = = = = = = = = = = = = =  h h h h h h h h h h h h 1 1 1 1 1 1 S S S S S S S S S S [ [ [ [ [ [ [ [ [ [ [ [ 9 9 9 9 9 9 ; ; ; ; ; ;                               . - - - - - - - - - - - + + + + + + D D D D D D D D 6 6 6 6 6 6 
	H	%	% "v7 v7 v7 v7 v7v7 v7 v7 v7 v7rH   