
    wiIP                         d dl mZmZmZmZmZ d dlZd dlZd dl	m
Z
mZ ddlmZ ddlmZmZmZ ddlmZ dd	lmZmZ d
dlmZ d
dlmZ  ej        e          ZdZ G d de          ZdS )    )CallableDictListOptionalUnionN)CLIPTextModelCLIPTokenizer   )DDPMWuerstchenScheduler)	deprecateloggingreplace_example_docstring)randn_tensor   )DiffusionPipelineImagePipelineOutput   )PaellaVQModel)WuerstchenDiffNeXta  
    Examples:
        ```py
        >>> import torch
        >>> from diffusers import WuerstchenPriorPipeline, WuerstchenDecoderPipeline

        >>> prior_pipe = WuerstchenPriorPipeline.from_pretrained(
        ...     "warp-ai/wuerstchen-prior", torch_dtype=torch.float16
        ... ).to("cuda")
        >>> gen_pipe = WuerstchenDecoderPipeline.from_pretrain("warp-ai/wuerstchen", torch_dtype=torch.float16).to(
        ...     "cuda"
        ... )

        >>> prompt = "an image of a shiba inu, donning a spacesuit and helmet"
        >>> prior_output = pipe(prompt)
        >>> images = gen_pipe(prior_output.image_embeddings, prompt=prompt)
        ```
c                   ^    e Zd ZdZdZg dZ	 d&dededede	d	e
d
eddf fdZd Z	 d'dZed             Zed             Zed             Z ej                     ee          ddddddddddddgfdeej        eej                 f         deeee         f         dedeee                  dedeeeee         f                  ded eeej        eej                 f                  deej                 d!ee         d"ed#eeeee gdf                  d$ee         fd%                        Z! xZ"S )(WuerstchenDecoderPipelineaR  
    Pipeline for generating images from the Wuerstchen model.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)

    Args:
        tokenizer (`CLIPTokenizer`):
            The CLIP tokenizer.
        text_encoder (`CLIPTextModel`):
            The CLIP text encoder.
        decoder ([`WuerstchenDiffNeXt`]):
            The WuerstchenDiffNeXt unet decoder.
        vqgan ([`PaellaVQModel`]):
            The VQGAN model.
        scheduler ([`DDPMWuerstchenScheduler`]):
            A scheduler to be used in combination with `prior` to generate image embedding.
        latent_dim_scale (float, `optional`, defaults to 10.67):
            Multiplier to determine the VQ latent space size from the image embeddings. If the image embeddings are
            height=24 and width=24, the VQ latent shape needs to be height=int(24*10.67)=256 and
            width=int(24*10.67)=256 in order to match the training conditions.
    ztext_encoder->decoder->vqgan)latentstext_encoder_hidden_statesnegative_prompt_embedsimage_embeddingsףp=
W%@	tokenizertext_encoderdecoder	schedulervqganlatent_dim_scalereturnNc                     t                                                       |                     |||||           |                     |           d S )N)r   r   r   r    r!   )r"   )super__init__register_modulesregister_to_config)selfr   r   r   r    r!   r"   	__class__s          /root/.openclaw/workspace/chatterbox_venv_py311/lib/python3.11/site-packages/diffusers/pipelines/wuerstchen/pipeline_wuerstchen.pyr&   z"WuerstchenDecoderPipeline.__init__S   sg     	% 	 	
 	
 	
 	1ABBBBB    c                     |t          ||||          }n:|j        |k    rt          d|j         d|           |                    |          }||j        z  }|S )N)	generatordevicedtypezUnexpected latents shape, got z, expected )r   shape
ValueErrortoinit_noise_sigma)r)   r1   r0   r/   r.   r   r    s          r+   prepare_latentsz)WuerstchenDecoderPipeline.prepare_latentsg   sr    ?"5IfTYZZZGG}%% !c'-!c!c\a!c!cdddjj((GI66r,   c                    t          |t                    rt          |          nd}|                     |d| j        j        dd          }|j        }|j        }	|                     |dd          j        }
|
j        d         |j        d         k    rt          j	        ||
          s| j        
                    |
d d | j        j        dz
  df                   }t                              d	| j        j         d
|            |d d d | j        j        f         }|	d d d | j        j        f         }	|                     |                    |          |	                    |                    }|j        }|                    |d          }d }|r^|dg|z  }nt#          |          t#          |          ur0t%          dt#          |           dt#          |           d          t          |t&                    r|g}n>|t          |          k    r)t)          d| dt          |           d| d| d	          |}|                     |d| j        j        dd          }|                     |j                            |          |j                            |                    }|j        }|j        d         }|                    d|d          }|                    ||z  |d          }||fS )Nr   
max_lengthTpt)paddingr7   
truncationreturn_tensorslongest)r9   r;   z\The following part of your input was truncated because CLIP can only handle sequences up to z	 tokens: )attention_maskr   dim z?`negative_prompt` should be the same type to `prompt`, but got z != .z`negative_prompt`: z has batch size z, but `prompt`: zT. Please make sure that passed `negative_prompt` matches the batch size of `prompt`.)
isinstancelistlenr   model_max_length	input_idsr>   r1   torchequalbatch_decodeloggerwarningr   r3   last_hidden_staterepeat_interleavetype	TypeErrorstrr2   repeatview)r)   promptr/   num_images_per_promptdo_classifier_free_guidancenegative_prompt
batch_sizetext_inputstext_input_idsr>   untruncated_idsremoved_texttext_encoder_outputr   !uncond_text_encoder_hidden_statesuncond_tokensuncond_input*negative_prompt_embeds_text_encoder_outputseq_lens                      r+   encode_promptz'WuerstchenDecoderPipeline.encode_promptr   s    %/vt$<$<CS[[[!
nn ~6 % 
 
 %.$3..SW.XXb $(<R(@@@UcetIuIu@>66qqq$.JilmJmprJrGr7sttLNNMN3M M>JM M   ,AAA/P1P/P,PQN+AAA/P1P/P,PQN"//0A0A&0I0IZhZkZklrZsZs/tt%8%J"%?%Q%QRgmn%Q%o%o",0)& &	&!#z 1fT/%:%:::(VZ[jVkVk ( (V( ( (   OS11 	0!0 1s?3333 3/ 3 33K_K_ 3 33 30:3 3 3   !0>>$>:# *  L :>9J9J&))&11,B]B`B`agBhBh :K : :6 1[0l- 8=a@G0Q0X0XYZ\qst0u0u-0Q0V0V22GR1 1- *+LLLr,   c                     | j         S N_guidance_scaler)   s    r+   guidance_scalez(WuerstchenDecoderPipeline.guidance_scale   s    ##r,   c                     | j         dk    S )Nr   rf   rh   s    r+   rV   z5WuerstchenDecoderPipeline.do_classifier_free_guidance   s    #a''r,   c                     | j         S re   )_num_timestepsrh   s    r+   num_timestepsz'WuerstchenDecoderPipeline.num_timesteps   s    ""r,      g        r   pilTr   r   rT   num_inference_steps	timestepsri   rW   rU   r.   output_typereturn_dictcallback_on_step_end"callback_on_step_end_tensor_inputsc                     |                     dd          }|                     dd          }|t          ddd           |t          ddd           |At           fd|D                       s&t          d j         d	 fd
|D                         j        } j        j        }| _        t          |t                    s9t          |t                    r|g}n t          dt          |           d           j        rP|Nt          |t                    s9t          |t                    r|g}n t          dt          |           d          t          |t                    rt          j        |d          }t          |t"          j                  r*t          j        ||                              |          }t          |t          j                  s t          dt          |           d          t          |t*                    s t          dt          |           d                               |||                    d          |z   j        |          \  }}|t          j        ||g          n|} j        r(t          j        |t          j        |          g          n|}t+          |                    d           j        j        z            }t+          |                    d           j        j        z            }|                    d          |z  d||f}|8 j                            ||            j        j        }t=          |          }n( j                            ||            j        j        }                     |||||	 j                  }	t=          |dd                    _         tC           "                    |dd                             D ]\  }}|#                    |	                    d                                        |          }                      j        rt          j        |	gdz            n|	 j        rt          j        |gdz            n|||          } j        r3|$                    d          \  }}t          j%        || j&                  } j        '                    |||	|          j(        }	|ni } |D ]}!tS                      |!         | |!<    | |||           }"|"                     d|	          }	|"                     d|          }|"                     d|          }|/||z  dk    r&|tU           j        dd           z  }# ||#||	           |
d!vrt          d"|
           |
d#k    s j+        j        j,        |	z  }	 j+        -                    |	          j.        /                    dd           }$|
d$k    rO|$0                    dddd           1                                2                                3                                }$nl|
d%k    rc|$0                    dddd           1                                2                                3                                }$ 4                    |$          }$n|	}$ 5                                 |s|$S tm          |$          S )&ae  
        Function invoked when calling the pipeline for generation.

        Args:
            image_embedding (`torch.Tensor` or `List[torch.Tensor]`):
                Image Embeddings either extracted from an image or generated by a Prior Model.
            prompt (`str` or `List[str]`):
                The prompt or prompts to guide the image generation.
            num_inference_steps (`int`, *optional*, defaults to 12):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            timesteps (`List[int]`, *optional*):
                Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps`
                timesteps are used. Must be in descending order.
            guidance_scale (`float`, *optional*, defaults to 0.0):
                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                `decoder_guidance_scale` is defined as `w` of equation 2. of [Imagen
                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting
                `decoder_guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely
                linked to the text `prompt`, usually at the expense of lower image quality.
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                if `decoder_guidance_scale` is less than `1`).
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
            latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will ge generated by sampling using the supplied random `generator`.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"`
                (`np.array`) or `"pt"` (`torch.Tensor`).
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
            callback_on_step_end (`Callable`, *optional*):
                A function that calls at the end of each denoising steps during the inference. The function is called
                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
                `callback_on_step_end_tensor_inputs`.
            callback_on_step_end_tensor_inputs (`List`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                `._callback_tensor_inputs` attribute of your pipeline class.

        Examples:

        Returns:
            [`~pipelines.ImagePipelineOutput`] or `tuple` [`~pipelines.ImagePipelineOutput`] if `return_dict` is True,
            otherwise a `tuple`. When returning a tuple, the first element is a list with the generated image
            embeddings.
        callbackNcallback_stepsz1.0.0zhPassing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`znPassing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`c              3   *   K   | ]}|j         v V  d S re   _callback_tensor_inputs.0kr)   s     r+   	<genexpr>z5WuerstchenDecoderPipeline.__call__.<locals>.<genexpr>*  sD       F
 F
23A--F
 F
 F
 F
 F
 F
r,   z2`callback_on_step_end_tensor_inputs` has to be in z, but found c                 &    g | ]}|j         v|S  rz   r|   s     r+   
<listcomp>z6WuerstchenDecoderPipeline.__call__.<locals>.<listcomp>.  sV      pH  pH  pHvw  bc  ko  kG  bG  bGpq  bG  bG  bGr,   z2'prompt' must be of type 'list' or 'str', but got rB   z;'negative_prompt' must be of type 'list' or 'str', but got r   r?   )r/   )r0   zI'image_embeddings' must be of type 'torch.Tensor' or 'np.array', but got z5'num_inference_steps' must be of type 'int', but got zo                           In Case you want to provide explicit timesteps, please use the 'timesteps' argument.r   r
      )rq   r/   r=   )reffnetclip)model_outputtimestepsampler.   r   r   r   orderr   )r8   npro   latentzSOnly the output types `pt`, `np`, `pil` and `latent` are supported not output_type=r   r   ro   )7popr   allr2   r{   _execution_devicer   r0   rg   rC   rD   rQ   rP   rO   rV   rH   catr   ndarrayTensorr3   intrc   size
zeros_likeconfigr"   r    set_timestepsrq   rE   r5   rl   	enumerateprogress_barexpandchunklerpri   stepprev_samplelocalsgetattrr!   scale_factordecoder   clamppermutecpufloatnumpynumpy_to_pilmaybe_free_model_hooksr   )%r)   r   rT   rp   rq   ri   rW   rU   r.   r   rr   rs   rt   ru   kwargsrw   rx   r/   r0   prompt_embedsr   r   r   latent_heightlatent_widthlatent_features_shapeitratiopredicted_latentspredicted_latents_textpredicted_latents_uncondcallback_kwargsr~   callback_outputsstep_idximagess%   `                                    r+   __call__z"WuerstchenDecoderPipeline.__call__   s   T ::j$//$4d;;z  
 %  A   .9# F
 F
 F
 F
7YF
 F
 F
 C
 C
9  JTEa  J  J  pH  pH  pH  pH  |^  pH  pH  pH  J  J  
 '"- &$'' 	f&#&& f  dUYZ`UaUa d d deee+ 	*:ot3T3T*os33 '6&7OO#nVZ[jVkVknnn   &-- 	B$y)9qAAA&
33 	]$|,<VLLLOOV[O\\*EL99 	u\`aq\r\ruuu   -s33 	qM`HaHa q q q   150B0B!!!$$'<<,1
 1
-- CYBdEI}&<=>>>jw 	#
 /"EI')9:J)K)KLMMM! 	 ,11!44t{7SSTT+0033dk6RRSS!1!6!6q!9!9<Q!QSTVceq r  N((9V(LLL0I"%i..N(()<V(LLL0I &&'<eVYX_aeaopp ")CRC.11d//	#2#??@@ %	/ %	/DAqHHW\\!__--0077E $,0,LY	7)a-(((RY,0,LW%)UGaK(((RW/	 !- ! ! / vCTCZCZ[\C]C]@&(@$)J/GI_aeat$u$u! n)).#	 *  
   $/"$; 5 5A)/!OA&&#7#7aO#T#T *..y'BB#3#7#78JL\#]#] -=-A-A02L. .* #N(:a(?(? C CC1g...;;;sfqss   h&&j'4w>GZ&&w//6<<QBBFd""1a337799??AAGGII%%1a337799??AAGGII**622F 	##%%% 	M"6***r,   )r   re   )#__name__
__module____qualname____doc__model_cpu_offload_seqr{   r	   r   r   r   r   r   r&   r5   rc   propertyri   rV   rm   rH   no_gradr   EXAMPLE_DOC_STRINGr   r   r   rQ   r   r   	Generatorboolr   r   r   __classcell__)r*   s   @r+   r   r   3   s        . ;   #(C C C $C $	C
 +C C  C 
C C C C C C(	 	 	" PM PM PM PMd $ $ X$ ( ( X( # # X# U]__122 )-#%+/ #;?%&MQ*.%* KO9Bd+ d+d5<.@ @Ad+ c49n%d+ !	d+
 DK(d+ d+ "%T#Y"78d+  #d+ E%/43H"HIJd+ %,'d+ c]d+ d+ 'xc40@$0F'GHd+ -1Id+ d+ d+ 32 _d+ d+ d+ d+ d+r,   r   ) typingr   r   r   r   r   r   r   rH   transformersr   r	   
schedulersr   utilsr   r   r   utils.torch_utilsr   pipeline_utilsr   r   modeling_paella_vq_modelr   modeling_wuerstchen_diffnextr   
get_loggerr   rK   r   r   r   r,   r+   <module>r      s7   9 8 8 8 8 8 8 8 8 8 8 8 8 8      5 5 5 5 5 5 5 5 1 1 1 1 1 1 B B B B B B B B B B - - - - - - C C C C C C C C 3 3 3 3 3 3 < < < < < < 
	H	%	% (C+ C+ C+ C+ C+ 1 C+ C+ C+ C+ C+r,   