
    wiE                         d dl mZmZmZmZ d dlZd dlmZ ddlm	Z	m
Z
 ddlmZmZ ddlmZmZ ddlmZ d	d
lmZmZ ddlmZ  ej        e          ZdZddZ G d de          ZdS )    )CallableListOptionalUnionN)XLMRobertaTokenizer   )UNet2DConditionModelVQModel)DDIMSchedulerDDPMScheduler)loggingreplace_example_docstring)randn_tensor   )DiffusionPipelineImagePipelineOutput   )MultilingualCLIPav  
    Examples:
        ```py
        >>> from diffusers import KandinskyPipeline, KandinskyPriorPipeline
        >>> import torch

        >>> pipe_prior = KandinskyPriorPipeline.from_pretrained("kandinsky-community/Kandinsky-2-1-prior")
        >>> pipe_prior.to("cuda")

        >>> prompt = "red cat, 4k photo"
        >>> out = pipe_prior(prompt)
        >>> image_emb = out.image_embeds
        >>> negative_image_emb = out.negative_image_embeds

        >>> pipe = KandinskyPipeline.from_pretrained("kandinsky-community/kandinsky-2-1")
        >>> pipe.to("cuda")

        >>> image = pipe(
        ...     prompt,
        ...     image_embeds=image_emb,
        ...     negative_image_embeds=negative_image_emb,
        ...     height=768,
        ...     width=768,
        ...     num_inference_steps=100,
        ... ).images

        >>> image[0].save("cat.png")
        ```
   c                 z    | |dz  z  }| |dz  z  dk    r|dz  }||dz  z  }||dz  z  dk    r|dz  }||z  ||z  fS )Nr   r   r    )hwscale_factornew_hnew_ws        /root/.openclaw/workspace/chatterbox_venv_py311/lib/python3.11/site-packages/diffusers/pipelines/kandinsky/pipeline_kandinsky.pyget_new_h_wr   B   sm    q E<?a
q E<?a
<!555    c            !       4    e Zd ZdZdZdedededee	e
f         def
 fdZd	 Z	 d"dZ ej                     ee          	 	 	 	 	 	 	 	 	 	 	 	 d#deeee         f         deej        eej                 f         deej        eej                 f         deeeee         f                  dedededededeeej        eej                 f                  deej                 dee         deeeeej        gd
f                  ded efd!                        Z xZS )$KandinskyPipelinea1  
    Pipeline for text-to-image generation using Kandinsky

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)

    Args:
        text_encoder ([`MultilingualCLIP`]):
            Frozen text-encoder.
        tokenizer ([`XLMRobertaTokenizer`]):
            Tokenizer of class
        scheduler (Union[`DDIMScheduler`,`DDPMScheduler`]):
            A scheduler to be used in combination with `unet` to generate image latents.
        unet ([`UNet2DConditionModel`]):
            Conditional U-Net architecture to denoise the image embedding.
        movq ([`VQModel`]):
            MoVQ Decoder to generate the image from the latents.
    ztext_encoder->unet->movqtext_encoder	tokenizerunet	schedulermovqc                     t                                                       |                     |||||           dt          | j        j        j                  dz
  z  | _        d S )N)r"   r#   r$   r%   r&   r   r   )super__init__register_moduleslenr&   configblock_out_channelsmovq_scale_factor)selfr"   r#   r$   r%   r&   	__class__s         r   r)   zKandinskyPipeline.__init__b   sr     	% 	 	
 	
 	
 "#s49+;+N'O'ORS'S!Tr   c                     |t          ||||          }n:|j        |k    rt          d|j         d|           |                    |          }||j        z  }|S )N)	generatordevicedtypezUnexpected latents shape, got z, expected )r   shape
ValueErrortoinit_noise_sigma)r/   r5   r4   r3   r2   latentsr%   s          r   prepare_latentsz!KandinskyPipeline.prepare_latentsv   sr    ?"5IfTYZZZGG}%% !c'-!c!c\a!c!cdddjj((GI66r   Nc                    t          |t                    rt          |          nd}|                     |dddddd          }|j        }|                     |dd          j        }	|	j        d	         |j        d	         k    rrt          j        ||	          s]| j                            |	d d | j        j	        dz
  d	f                   }
t                              d
| j        j	         d|
            |                    |          }|j                            |          }|                     ||          \  }}|                    |d          }|                    |d          }|                    |d          }|r|dg|z  }nt!          |          t!          |          ur0t#          dt!          |           dt!          |           d          t          |t$                    r|g}n>|t          |          k    r)t'          d| dt          |           d| d| d	          |}|                     |dddddd          }|j                            |          }|j                            |          }|                     ||          \  }}|j        d         }|                    d|          }|                    ||z  |          }|j        d         }|                    d|d          }|                    ||z  |d	          }|                    |d          }t          j        ||g          }t          j        ||g          }t          j        ||g          }|||fS )Nr   
max_lengthTM   pt)padding
truncationr<   return_attention_maskadd_special_tokensreturn_tensorslongest)r?   rC   z\The following part of your input was truncated because CLIP can only handle sequences up to z	 tokens: )	input_idsattention_maskr   dim z?`negative_prompt` should be the same type to `prompt`, but got z != .z`negative_prompt`: z has batch size z, but `prompt`: zT. Please make sure that passed `negative_prompt` matches the batch size of `prompt`.)r?   r<   r@   rA   rB   rC   )
isinstancelistr+   r#   rF   r5   torchequalbatch_decodemodel_max_lengthloggerwarningr7   rG   r"   repeat_interleavetype	TypeErrorstrr6   repeatviewcat)r/   promptr3   num_images_per_promptdo_classifier_free_guidancenegative_prompt
batch_sizetext_inputstext_input_idsuntruncated_idsremoved_text	text_maskprompt_embedstext_encoder_hidden_statesuncond_tokensuncond_inputuncond_text_input_idsuncond_text_masknegative_prompt_embeds!uncond_text_encoder_hidden_statesseq_lens                        r   _encode_promptz KandinskyPipeline._encode_prompt   sC    %/vt$<$<CS[[[!
nn "&# % 
 
 %...SW.XXb $(<R(@@@UcetIuIu@>66qqq$.JilmJmprJrGr7sttLNNMN3M M>JM M  
 (**622.11&99	484E4E$Y 5F 5
 5
11 &778MST7UU%?%Q%QRgmn%Q%o%o"//0E1/MM	& 9	A&!#z 1fT/%:%:::(VZ[jVkVk ( (V( ( (   OS11 	0!0 1s?3333 3/ 3 33K_K_ 3 33 30:3 3 3   !0>>$&*#'# *  L %1$:$=$=f$E$E!+:==fEEHLHYHY/@P IZ I IE"$E -215G%;%B%B1F[%\%\"%;%@%@NcAcel%m%m"7=a@G0Q0X0XYZ\qst0u0u-0Q0V0V22GR1 1-  0AABW]^A__ "I'=}&MNNM).4UWq3r)s)s&	#3Y"?@@I8)CCr      d         @r   pilTr[   image_embedsnegative_image_embedsr^   heightwidthnum_inference_stepsguidance_scaler\   r2   r9   output_typecallbackcallback_stepsreturn_dictc                 j   t          |t                    rd}nDt          |t                    rt          |          }nt	          dt          |                     | j        }||	z  }|dk    }|                     |||	||          \  }}}t          |t                    rt          j	        |d          }t          |t                    rt          j	        |d          }|r`|
                    |	d          }|
                    |	d          }t          j	        ||gd                              |j        |          }| j                            ||           | j        j        }| j        j        j        }t'          ||| j                  \  }}|                     ||||f|j        ||
|| j                  }t-          |                     |                    D ]g\  }}|rt          j	        |gdz            n|}||d	}|                     ||||d
          d         }|rx|                    |j        d         d          \  }}|                    d          \  }}|                    d          \  }} ||||z
  z  z   }t          j	        || gd          }t7          | j        j        d          r| j        j        j        dv s%|                    |j        d         d          \  }}| j                            ||||
          j        }|/||z  dk    r&|t?          | j        dd          z  }! ||!||           i| j         !                    |d          d         }"| "                                 |dvrt	          d|           |dv rl|"dz  dz   }"|"#                    dd          }"|"$                                %                    dddd          &                                '                                }"|dk    r| (                    |"          }"|s|"fS tS          |"          S )a  
        Function invoked when calling the pipeline for generation.

        Args:
            prompt (`str` or `List[str]`):
                The prompt or prompts to guide the image generation.
            image_embeds (`torch.Tensor` or `List[torch.Tensor]`):
                The clip image embeddings for text prompt, that will be used to condition the image generation.
            negative_image_embeds (`torch.Tensor` or `List[torch.Tensor]`):
                The clip image embeddings for negative text prompt, will be used to condition the image generation.
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                if `guidance_scale` is less than `1`).
            height (`int`, *optional*, defaults to 512):
                The height in pixels of the generated image.
            width (`int`, *optional*, defaults to 512):
                The width in pixels of the generated image.
            num_inference_steps (`int`, *optional*, defaults to 100):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            guidance_scale (`float`, *optional*, defaults to 4.0):
                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                `guidance_scale` is defined as `w` of equation 2. of [Imagen
                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
                usually at the expense of lower image quality.
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
            latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will ge generated by sampling using the supplied random `generator`.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"`
                (`np.array`) or `"pt"` (`torch.Tensor`).
            callback (`Callable`, *optional*):
                A function that calls every `callback_steps` steps during inference. The function is called with the
                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function is called. If not specified, the callback is called at
                every step.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.

        Examples:

        Returns:
            [`~pipelines.ImagePipelineOutput`] or `tuple`
        r   z2`prompt` has to be of type `str` or `list` but is g      ?r   rH   )r4   r3   )r3   r   )text_embedsrs   F)sampletimestepencoder_hidden_statesadded_cond_kwargsr|   variance_type)learnedlearned_range)r2   NorderT)force_not_quantizer   )r>   nprr   zIOnly the output types `pt`, `pil` and `np` are supported not output_type=)r   rr   g      ?r   rr   )images)*rL   rW   rM   r+   r6   rU   _execution_devicern   rN   rZ   rT   r7   r4   r%   set_timesteps	timestepsr$   r,   in_channelsr   r.   r:   	enumerateprogress_barsplitr5   chunkhasattrr   stepprev_samplegetattrr&   decodemaybe_free_model_hooksclampcpupermutefloatnumpynumpy_to_pilr   )#r/   r[   rs   rt   r^   ru   rv   rw   rx   r\   r2   r9   ry   rz   r{   r|   r_   r3   r]   re   rf   _timesteps_tensornum_channels_latentsitlatent_model_inputr   
noise_predvariance_prednoise_pred_uncondnoise_pred_textvariance_pred_textstep_idximages#                                      r   __call__zKandinskyPipeline.__call__   s   R fc"" 	bJJ%% 	bVJJ`RVW]R^R^``aaa'"77
&4s&:#7;7J7JF13NP_8
 8
411 lD)) 	: 9\q999L+T22 	L$)I.C$K$K$K!& 	'99:OUV9WWL$9$K$KLagh$K$i$i! 9&;\%JPQRRRUU#)& V  L 	$$%8$HHH>3#y/;#FE43IJJ &&-vu=&,N
 
 d//0@AABB $	/ $	/DAq=X!eG9q=!9!9!9^e0=| \ \)&@"3! #   J + P,6,<,<W]1=MST,<,U,U)
M5?5E5Ea5H5H2!?(5(;(;A(>(>%%.?UfCf1gg
"Y
4F'GQOOO
 -??JN)7;WWW * 0 0q1Aq 0 I I
A n))#	 *  
   #N(:a(?(? C CC1g... 	  T BB8L##%%%111vitvvwww-''CK#%EKK1%%EIIKK''1a3399;;AACCE%%%e,,E 	8O"%0000r   )N)Nro   ro   rp   rq   r   NNrr   Nr   T)__name__
__module____qualname____doc__model_cpu_offload_seqr   r   r	   r   r   r   r
   r)   r:   rn   rN   no_gradr   EXAMPLE_DOC_STRINGrW   r   Tensorr   intr   	Generatorr   boolr   __classcell__)r0   s   @r   r!   r!   L   s\        & 7U&U 'U #	U
 56U U U U U U U(	 	 	" dD dD dD dDL U]__122 <@#& #%&MQ*.%*GK !n1 n1c49n%n1 EL$u|*<<=n1  %U\43E%EF	n1
 "%T#Y"78n1 n1 n1 !n1 n1  #n1 E%/43H"HIJn1 %,'n1 c]n1 8S#u|$<d$BCDn1 n1  !n1 n1 n1 32 _n1 n1 n1 n1 n1r   r!   )r   )typingr   r   r   r   rN   transformersr   modelsr	   r
   
schedulersr   r   utilsr   r   utils.torch_utilsr   pipeline_utilsr   r   r"   r   
get_loggerr   rR   r   r   r!   r   r   r   <module>r      sY   3 2 2 2 2 2 2 2 2 2 2 2       4 3 3 3 3 3 3 3 6 6 6 6 6 6 6 6        . - - - - - C C C C C C C C * * * * * * 
	H	%	% >6 6 6 6K1 K1 K1 K1 K1) K1 K1 K1 K1 K1r   